setup:

installing

here::i_am("notebooks/aale_resubmission_2021.nb.Rmd")

system('sudo apt-get update -y && sudo apt-get install -y libxt-dev')
# https://stackoverflow.com/questions/23642353/error-message-installing-cairo-package-in-r
# Enables `Cairo` installation, ComplexHeatmap dependency

system(paste0('sudo cp ', 
              here::here('install_bin/bwtool'),
              ' /usr/local/bin/'))
system('sudo apt install -y mysql-server')
system('sudo apt-get install -y libmagick++-6.q16-dev')
system('sudo apt-get install -y libpoppler-cpp-dev')

BiocManager::install(c("DESeq2",
                       "bamsignals",
                       "rGREAT",
                       "MotifDb",
                       "ComplexHeatmap"), update = FALSE, ask = FALSE)

remotes::install_github(repo = "poisonalien/trackplot", upgrade = F)

install.packages(c('magick', 'cowplot', 'Hmisc', 'pdftools', 'svglite'))

loading

suppressPackageStartupMessages({
  library(tidyverse)
  library(here)
  library(ggsci)
  library(ggpubr)
  library(ggsignif)
  library(ggthemes) 
  library(ggrepel)
  library(ggforce)
  library(ggpmisc)
  library(cowplot)
  library(patchwork)
  library(extrafont)
  library(ggrepel)
  library(pheatmap)
  library(viridis)
  library(patchwork)
  library(fgsea)
  library(msigdbr)
  library(matrixStats)
  library(tximeta)
  library(SummarizedExperiment)
  library(rjson)
  library(HDF5Array)
  library(DESeq2)
  library(survival)
  library(survminer)
})
suppressMessages(loadfonts())

theming:

setting ggplot theme

base_size = 10

theme_set(theme_foundation(base_size = base_size,
                           base_family = "Helvetica") + 
            theme(plot.title = element_text(size = rel(0.95), face = 'bold'),
                panel.background = element_rect(colour = NA),
                plot.background = element_rect(colour = NA),
                panel.border = element_rect(colour = NA), 
                axis.line = element_line(), 
                axis.line.x = NULL, 
                axis.line.y = NULL, 
                axis.text = element_text(size = rel(0.8)), 
                axis.title = element_text(size = rel(0.9)),
                axis.text.x = element_text(margin = margin(t = 0.8 * base_size/4), size = rel(0.8)), 
                axis.text.x.top = element_text(margin = margin(b = 0.8 * base_size/4), vjust = 0), 
                axis.text.y = element_text(margin = margin(r = 0.5 * base_size/4), hjust = 1, size = rel(0.8)), 
                axis.text.y.right = element_text(margin = margin(l = 0.5 * base_size/4), hjust = 0), 
                axis.ticks = element_line(), 
                axis.ticks.length = unit(base_size/2.5, "pt"), axis.ticks.length.x = NULL, 
                axis.ticks.length.x.top = NULL, axis.ticks.length.x.bottom = NULL, 
                axis.ticks.length.y = NULL, axis.ticks.length.y.left = NULL, 
                axis.ticks.length.y.right = NULL,
                strip.text = element_text(size = rel(0.8), face = 'bold'),
                strip.background = element_blank(),
                legend.key.size= unit(0.08, "in"),
                legend.spacing = unit(0, "in"),
                legend.key = element_rect(colour = NA),
                legend.title = element_text(face="bold", size = rel(0.8)),
                legend.text = element_text(size = rel(0.75)),
                legend.justification = c("right", "top"),
                legend.box.just = "right",
                legend.margin = margin(1.016, 1.016, 1.016, 1.016, unit = 'mm'),
                plot.margin=margin(2.032,
                                   2.032,
                                   2.032,
                                   2.032,
                                   unit = "mm"),
                plot.tag = element_text(size = rel(0.95), face = 'bold'),
                plot.tag.position = c(0.01, 0.99),
                legend.box.spacing = unit(-0.508, 'mm'),
                panel.grid.major = element_blank(),
                panel.grid.minor = element_blank()
))


theme_set_fun <- function() {
  theme_set(theme_foundation(base_size = base_size,
                           base_family = 'Arial') + 
            theme(plot.title = element_text(size = rel(0.95), face = 'bold'),
                panel.background = element_rect(colour = NA),
                plot.background = element_rect(colour = NA),
                panel.border = element_rect(colour = NA), 
                axis.line = element_line(), 
                axis.line.x = NULL, 
                axis.line.y = NULL, 
                axis.text = element_text(size = rel(0.8)), 
                axis.title = element_text(size = rel(0.9)),
                axis.text.x = element_text(margin = margin(t = 0.8 * base_size/4), size = rel(0.8)), 
                axis.text.x.top = element_text(margin = margin(b = 0.8 * base_size/4), vjust = 0), 
                axis.text.y = element_text(margin = margin(r = 0.5 * base_size/4), hjust = 1, size = rel(0.8)), 
                axis.text.y.right = element_text(margin = margin(l = 0.5 * base_size/4), hjust = 0), 
                axis.ticks = element_line(), 
                axis.ticks.length = unit(base_size/2.5, "pt"), axis.ticks.length.x = NULL, 
                axis.ticks.length.x.top = NULL, axis.ticks.length.x.bottom = NULL, 
                axis.ticks.length.y = NULL, axis.ticks.length.y.left = NULL, 
                axis.ticks.length.y.right = NULL,
                strip.text = element_text(size = rel(0.8), face = 'bold'),
                strip.background = element_blank(),
                legend.key.size= unit(0.08, "in"),
                legend.spacing = unit(0, "in"),
                legend.key = element_rect(colour = NA),
                legend.title = element_text(face="bold", size = rel(0.8)),
                legend.text = element_text(size = rel(0.75)),
                legend.justification = c("right", "top"),
                legend.box.just = "right",
                legend.margin = margin(1.016, 1.016, 1.016, 1.016, unit = 'mm'),
                plot.margin=margin(2.032,
                                   2.032,
                                   2.032,
                                   2.032,
                                   unit = "mm"),
                plot.tag = element_text(size = rel(0.95), face = 'bold'),
                plot.tag.position = c(0.01, 0.99),
                legend.box.spacing = unit(-0.508, 'mm'),
                panel.grid.major = element_blank(),
                panel.grid.minor = element_blank()
))
}

txt.mm_to_pts <- function(ratio){ base_size * ratio * (25.4 / 72.27) } # for reverse can use .pt

figure dirs, and other globally relevant constants

figure.dir <-  here('figures')
data.dir <-  here('data')
output.data.dir <- here('output.data')
reference.dir <- here('reference')
job.scripts.dir <- here('R/job.scripts')

chr.order <- c('chr1', 'chr2', 'chr3', 
               'chr4', 'chr5', 'chr6', 
               'chr7', 'chr8', 'chr9',
               'chr10', 'chr11', 'chr12', 
               'chr13', 'chr14', 'chr15', 
               'chr16', 'chr17',
               'chr18', 'chr19', 'chr20', 
               'chr21', 'chr22', 'chrX', 'chrY')

te_alone_annotation_levels <- c('DNA',
                                'LINE',
                                'LTR',
                                'Satellite',
                                'SINE')

te_alone_color_pal <- viridis(5, direction = -1)
names(te_alone_color_pal) <- te_alone_annotation_levels

saving panels

panel.figure.width = 85
panel.figure.width.medium = 114
panel.figure.width.double = 17
panel.figure.height = 279.4 

patch_fig_width = unit(172, 'mm')
patch_fig_height = unit(279.4 * 0.9 , 'mm')

gene sets for filtering, identification

if (! file.exists(file.path(output.data.dir, 'meta_data_df.rds'))) {
  
  meta_data <- lst()
  
  meta_data$te.clades <- 
  read_csv(file.path(output.data.dir, 'reference.data', 'te.clades.csv'))

  meta_data$te.families.clades <- 
    read_csv(file.path(output.data.dir, 'reference.data','te.family.clade.csv'),
             col_names = c('gene', 'family', 'clade')) %>% 
    distinct()
  
  meta_data$trono.krab.znfs <- 
    read_csv(file.path(output.data.dir, 'krab.znf.trono.list.csv'), 
             col_names = c('gene', 'biotype', 'ensg', 'chr', 'start', 'end', 'strand', 'ZNF'))
  
  meta_data$gen.24.names <- 
    read_csv(file.path(output.data.dir, 'reference.data/gen.24.names.csv'),
             col_names = c('enst','ensg','tx','gene','len','biotype')) %>% 
    select(ensg, gene) %>% 
    distinct()
  
  meta_data$gencode.35.tx.to.gene <- 
    read_csv(file.path(reference.dir, 'gencode.v35.tx.to.gene.csv'), 
                                    col_names = c('tx', 'gene')) %>% 
    separate(tx, sep = '\\|', into = c('enst','ensg',NA,NA,'tx','gene','len','biotype'))
  
  meta_data$gencode.35.tx.to.gene %>% 
    dplyr::select(-enst) %>% 
    group_by(biotype) %>% 
    tally() -> gen.35.biotype.count
  
  meta_data$gencode.35.lnc.gene.names <- 
    read_csv(file.path(reference.dir, 'gencode.v35.lncRNA.gene.names.txt'), 
             col_names = 'gene')
  
  meta_data$ucsc.rmsk.tx.to.gene <-  
    read_csv(file.path(reference.dir, 'gencode.v35.ucsc.rmsk.tx.to.gene.csv'), 
                                    col_names = c('enst', 'gene')) %>%
    filter(grepl('^hg38', enst))
  
  ucsc.rmsk.tx.to.gene %>% 
    mutate(tx = enst) %>% 
    separate(tx, sep = '_', into = c(NA, NA, 'gene', 'position', NA, NA, 'strand', NA)) %>% 
    mutate(position = sub('range=', '', position),
           strand = sub('strand=', '', strand)) %>% 
    merge(te.families.clades, by = 'gene') %>% 
    mutate(tmp.gene = gene,
           ensg = NA) %>% 
    dplyr::select(enst, ensg, 'gene' = tmp.gene, 'len' = clade, 'biotype' = family) -> meta_data$ucsc.rmsk.tx.to.gene.fmt
  
  
  meta_data$tot.gencode.35.tx.to.gene <- 
    read_csv(file.path(reference.dir, 'gencode.v35.tx.to.gene.csv'), 
                                    col_names = c('tx', 'gene')) %>% 
    mutate(enst = tx) %>% 
    separate(tx, sep = '\\|', into = c(NA,'ensg',NA,NA,'tx','gene','len','biotype'))
  
  meta_data$total.tx.to.gene <- 
    rbind(tot.gencode.35.tx.to.gene %>% dplyr::select(enst, ensg, gene, len, biotype),
          ucsc.rmsk.tx.to.gene.fmt %>% dplyr::select(enst = enst, ensg, gene, len, biotype))
  
  
  meta_data$panc_methylation_df <- read_tsv(file.path(output.data.dir, 'GSE119548_avg_beta.txt.gz'))
  
  saveRDS(meta_data, file.path(output.data.dir, 'meta_data_df.rds'))
  
} else {
  meta.data <- readRDS(file.path(output.data.dir, 'meta_data_df.rds'))
}

processed dataset check:

build_salmon.quant <- FALSE

if (file.exists(file.path(output.data.dir, 'salmon.quant_analysis_set.rds'))) { 
  
  salmon.quant <- readRDS(file = file.path(output.data.dir, 'salmon.quant_analysis_set.rds'))
  
  build_salmon.quant <- FALSE

} else {
  
  message('Warning: salmon published dataset not found, will generate equivalent')
  
  build_salmon.quant <- TRUE
  
}

build_ucsc.rmsk.salmon <- FALSE

if (file.exists(file.path(output.data.dir, 'ucsc.rmsk.salmon.quant_analysis_set.rds'))) { 
  
  ucsc.rmsk.salmon.quant <- readRDS(file = file.path(output.data.dir, 'ucsc.rmsk.salmon.quant_analysis_set.rds'))
  
  build_ucsc.rmsk.salmon <- FALSE

} else {
  
  message('Warning: ucsc rmsk salmon published dataset not found, will generate equivalent')
  
  build_ucsc.rmsk.salmon <- TRUE
  
}

eval_build_datasets <- ifelse(build_ucsc.rmsk.salmon == F &
                                build_salmon.quant == F, FALSE, TRUE)

feature engineering:

FUN: construct.quant.object

load.tximeta.object <- function(reference){
  
  ## RERCOMM
  
  print(file.path(output.data.dir, paste0(reference, '_h5_se')))
  
  txi <- loadHDF5SummarizedExperiment(dir=file.path(output.data.dir, paste0(reference, '_h5_se')))
  gxi <- loadHDF5SummarizedExperiment(dir=file.path(output.data.dir, paste0(reference, '_gene_h5_se')))
  
  if (reference == 'process.aware.salmon'){
    
    gxi.split <- loadHDF5SummarizedExperiment(dir=file.path(output.data.dir, paste0(reference, '_gene_split_h5_se')))
    
    return(list('txi' = txi, 'gxi' = gxi, 'gxi.split' = gxi.split))
    
  } else {
    
    return(list('txi' = txi, 'gxi' = gxi))
  }
  
}

subset.tximeta.object <- function(tximeta.list, grep.string){
  
    ## RERCOMM

  
  if (grepl('\\|', grep.string)){
      
    subset.txi <- SummarizedExperiment::subset(tximeta.list$txi, 
                                         select = !grepl(grep.string, colData(tximeta.list$txi)$names))
  
    subset.gxi <- SummarizedExperiment::subset(tximeta.list$gxi, 
                                         select = !grepl(grep.string, colData(tximeta.list$gxi)$names))
  } else{
        
    subset.txi <- SummarizedExperiment::subset(tximeta.list$txi, 
                                         select = grepl(grep.string, colData(tximeta.list$txi)$names))
  
    subset.gxi <- SummarizedExperiment::subset(tximeta.list$gxi, 
                                         select = grepl(grep.string, colData(tximeta.list$gxi)$names))
  }
  
  return(list('txi' = subset.txi,
              'gxi' = subset.gxi))
}

CALL: construct.quant.object

if (build_salmon.quant == T) { 
  
  salmon.quant <- load.tximeta.object('salmon')
  salmon.quant$aale$intra <- subset.tximeta.object(salmon.quant, 'intra.aale')
  salmon.quant$hbec$hbec_wt <- subset.tximeta.object(salmon.quant, 'wt2_kras')
  salmon.quant$hbec$hbec_v12 <- subset.tximeta.object(salmon.quant, '_v12')
  salmon.quant$hbec$lacz_v_wt <- subset.tximeta.object(salmon.quant, '_v12|mut2|aale|ZNF|Empty')
  salmon.quant$hbec$lacz_v_v12 <- subset.tximeta.object(salmon.quant, '_wt|mut2|aale|ZNF|Empty')
  salmon.quant$a549 <- subset.tximeta.object(salmon.quant, 'intra.a549')
  salmon.quant$all_intra <- subset.tximeta.object(salmon.quant, 'rnaEASY|hold')
  salmon.quant$aale$exo <- subset.tximeta.object(salmon.quant, 'rnaEASY.aale')
  salmon.quant$aale$kras_compare <- subset.tximeta.object(salmon.quant, 'aale.*.kras')
  salmon.quant$aale$ctrl_compare <- subset.tximeta.object(salmon.quant, 'aale.*.ctrl')
  salmon.quant$aale$platform_compare <- subset.tximeta.object(salmon.quant, 'aale')
  
  colData(salmon.quant$gxi) %>% 
    as.data.frame() %>% 
    remove_rownames() -> salmon.quant$meta.data.df
  
  rowRanges(salmon.quant$gxi) %>% 
    as.data.frame() %>% 
    dplyr::rename('chr' = seqnames) %>% 
    merge(meta.data$gencode.35.tx.to.gene %>% select(ensg, gene) %>% distinct(), by.x = 'gene_id', by.y = 'ensg') -> 
    salmon.quant$gxi.bed
  
  rowRanges(salmon.quant$txi) %>% 
    as.data.frame() %>% 
    dplyr::rename('chr' = seqnames) %>% 
    merge(meta.data$gencode.35.tx.to.gene %>% select(enst, gene) %>% distinct(), by.x = 'tx_name', by.y = 'enst') -> 
    salmon.quant$txi.bed
}


# ------------------------------------------------------------------------------

if (build_ucsc.rmsk.salmon == T) { 

  ucsc.rmsk.salmon.quant <- load.tximeta.object('ucsc.rmsk.salmon')
  
  ucsc.rmsk.salmon.quant$aale$intra <- 
    subset.tximeta.object(ucsc.rmsk.salmon.quant, 'intra.aale')
  ucsc.rmsk.salmon.quant$hbec$hbec_wt <- 
    subset.tximeta.object(ucsc.rmsk.salmon.quant, 'wt2_kras')
  ucsc.rmsk.salmon.quant$hbec$hbec_v12 <- 
    subset.tximeta.object(ucsc.rmsk.salmon.quant, '_v12')
  ucsc.rmsk.salmon.quant$hbec$lacz_v_wt <- 
    subset.tximeta.object(ucsc.rmsk.salmon.quant, '_v12|mut2|aale|ZNF|Empty')
  ucsc.rmsk.salmon.quant$hbec$lacz_v_v12 <- 
    subset.tximeta.object(ucsc.rmsk.salmon.quant, '_wt|mut2|aale|ZNF|Empty')
  ucsc.rmsk.salmon.quant$a549 <- 
    subset.tximeta.object(ucsc.rmsk.salmon.quant, 'intra.a549')
  ucsc.rmsk.salmon.quant$all_intra <- 
    subset.tximeta.object(ucsc.rmsk.salmon.quant, 'rnaEASY|hold')
  ucsc.rmsk.salmon.quant$aale$exo <- 
    subset.tximeta.object(ucsc.rmsk.salmon.quant, 'rnaEASY.aale')
  ucsc.rmsk.salmon.quant$aale$kras_compare <- 
    subset.tximeta.object(ucsc.rmsk.salmon.quant, 'aale.*.kras')
  ucsc.rmsk.salmon.quant$aale$ctrl_compare <- 
    subset.tximeta.object(ucsc.rmsk.salmon.quant, 'aale.*.ctrl')
  ucsc.rmsk.salmon.quant$aale$platform_compare <- 
    subset.tximeta.object(ucsc.rmsk.salmon.quant, 'aale')
  
  colData(ucsc.rmsk.salmon.quant$gxi) %>% 
    as.data.frame() %>% 
    remove_rownames() -> ucsc.rmsk.salmon.quant$meta.data.df
}

FUN: run.de.seq

run.de.seq <- function(tximeta.object, base_level = 'ctrl'){
  
  tximeta.name <- deparse(substitute(tximeta.object))
  
  print(tximeta.name)
  
  colData(tximeta.object$gxi) %>% 
    as.data.frame() %>% 
    remove_rownames() %>% 
    mutate(condition = relevel(as.factor(condition), base_level)) -> colData.gxi
  
  print(levels(colData.gxi$condition))
  
  print(colData.gxi)

  count.matrix.gxi <- assay(tximeta.object$gxi, 'counts') %>% 
    as.data.frame() %>% 
    rownames_to_column('gene') %>% 
    mutate_if(is.numeric, round) %>% 
    column_to_rownames('gene')
    
  colData(tximeta.object$txi) %>% 
    as.data.frame() %>% 
    remove_rownames() -> colData.txi

  count.matrix.txi <- assay(tximeta.object$txi, 'counts') %>% 
    as.data.frame() %>% 
    rownames_to_column('tmp') %>% 
    mutate_if(is.numeric, round) %>% 
    column_to_rownames('tmp')
  
  if (grepl('compare', tximeta.name)) {
    
    colData.gxi$platform <- relevel(as.factor(colData.gxi$platform), 'intra')
    
    dds.gxi <- DESeqDataSetFromMatrix(countData = count.matrix.gxi, 
                                    colData = colData.gxi, 
                                    design = as.formula(~platform))

    dds.txi <- DESeqDataSetFromMatrix(countData = count.matrix.txi, 
                                      colData = colData.txi, 
                                      design = as.formula(~platform))
    
  } else {
    
    dds.gxi <- DESeqDataSetFromMatrix(countData = count.matrix.gxi, 
                                    colData = colData.gxi, 
                                    design = as.formula(~condition))

    dds.txi <- DESeqDataSetFromMatrix(countData = count.matrix.txi, 
                                      colData = colData.txi, 
                                      design = as.formula(~condition))
    
  }
  
  dds.txi <- estimateSizeFactors(dds.txi)
  
  dds.txi.norm.counts <- as.data.frame(counts(dds.txi, normalized=T))
  
  dds.txi.vst.counts <- as.data.frame(assay(vst(dds.txi, blind = F)))
  
  dds.gxi <- estimateSizeFactors(dds.gxi)
  
  dds.gxi.norm.counts <- as.data.frame(counts(dds.gxi, normalized=T))
  
  dds.gxi.vst.counts <-  as.data.frame(assay(vst(dds.gxi, blind = F)))
  
  if (grepl('all_intra|a549', tximeta.name)){
    print('no DE calc')
      if (grepl('ucsc.rmsk', tximeta.name)){  
    
        print('rmsk')
    
        dds.gxi.norm.counts <- dds.gxi.norm.counts %>% 
          rownames_to_column('gene') 
        
        return(list('counts' = dds.gxi.norm.counts,
                    'txi.counts' = dds.txi.norm.counts,
                    'gxi.vst.counts' = dds.gxi.vst.counts,
                    'col.data' = colData.txi))
        
        quit()
      }
    
      dds.gxi.norm.counts <- 
        dds.gxi.norm.counts %>% 
          rownames_to_column('gene_id') %>% 
          merge(salmon.quant$gxi.bed, by = 'gene_id') %>% 
          distinct()
      
      dds.txi.norm.counts <- 
        dds.gxi.norm.counts %>% 
          rownames_to_column('tx_name') %>% 
          merge(salmon.quant$txi.bed, by = 'tx_name') %>% 
          distinct() 
    
      return(list('counts' = dds.gxi.norm.counts, 
                  'txi.counts' = dds.txi.norm.counts,
                  'gxi.vst.counts' = dds.gxi.vst.counts,
                  'col.data' = colData.txi))
      quit()

  }
  
  filter <- rowSums(counts(dds.gxi, normalized=T) >= 10) >= 2
  
  dds.gxi <- DESeq(dds.gxi[filter,])
  
  print(resultsNames(dds.gxi))
  
  coef = print(tail(resultsNames(dds.gxi), n=1))

  if (grepl('ucsc.rmsk', tximeta.name)){  
    
    print('rmsk')
    
    dds.gxi.lfc <- lfcShrink(dds.gxi, coef = coef) %>% 
      as.data.frame() %>% 
      # filter(padj <= 0.05) %>% 
      rownames_to_column('gene') %>% 
      dplyr::select(gene, baseMean, log2FoldChange, padj)
    
    dds.gxi.norm.counts <- dds.gxi.norm.counts %>% 
      rownames_to_column('gene') 
    
  } else{
    
    dds.gxi.lfc <- lfcShrink(dds.gxi, coef = coef) %>% 
      as.data.frame() %>% 
      # filter(padj <= 0.05) %>% 
      rownames_to_column('gene_id') %>% 
      merge(salmon.quant$gxi.bed, by = 'gene_id')
    
    plotMA(lfcShrink(dds.gxi, coef = coef))
    
    dds.gxi.norm.counts <- 
      dds.gxi.norm.counts %>% 
        rownames_to_column('gene_id') %>% 
        merge(salmon.quant$gxi.bed, by = 'gene_id') %>% 
        distinct()
    
    dds.txi.norm.counts <- 
      dds.gxi.norm.counts %>% 
        rownames_to_column('tx_name') %>% 
        merge(salmon.quant$txi.bed, by = 'tx_name') %>% 
        distinct() 
  }

return(list('de' = dds.gxi.lfc, 
            'counts' = dds.gxi.norm.counts, 
            'txi.counts' = dds.txi.norm.counts, 
            'txi.vst.counts' = dds.txi.vst.counts,
            'gxi.vst.counts' = dds.gxi.vst.counts,
            'col.data' = colData.txi))
  
}

run.de.seq.A549.contrast <- function(tximeta.object, base_level = 'Empty'){
  
  tximeta.name <- deparse(substitute(tximeta.object))
  
  print(tximeta.name)
  
  colData(tximeta.object$gxi) %>% 
    as.data.frame() %>% 
    remove_rownames() %>% 
    mutate(condition = relevel(as.factor(condition), base_level)) -> colData.gxi
  
  print(levels(colData.gxi$condition))
  
  print(colData.gxi)

  count.matrix.gxi <- assay(tximeta.object$gxi, 'counts') %>% 
    as.data.frame() %>% 
    rownames_to_column('gene') %>% 
    mutate_if(is.numeric, round) %>% 
    column_to_rownames('gene')
    
  colData(tximeta.object$txi) %>% 
    as.data.frame() %>% 
    remove_rownames() -> colData.txi

  count.matrix.txi <- assay(tximeta.object$txi, 'counts') %>% 
    as.data.frame() %>% 
    rownames_to_column('tmp') %>% 
    mutate_if(is.numeric, round) %>% 
    column_to_rownames('tmp')
  
  # colData.gxi$platform <- relevel(as.factor(colData.gxi$platform), 'intra')
  
  dds.gxi <- DESeqDataSetFromMatrix(countData = count.matrix.gxi, 
                                  colData = colData.gxi, 
                                  design = as.formula(~condition))

  # dds.txi <- DESeqDataSetFromMatrix(countData = count.matrix.txi, 
  #                                   colData = colData.txi, 
  #                                   design = as.formula(~platform))
  
  dds.gxi <- estimateSizeFactors(dds.gxi)
  
  filter <- rowSums(counts(dds.gxi, normalized=T) >= 10) >= 2
  
  dds.gxi <- DESeq(dds.gxi[filter,])
  
  print(resultsNames(dds.gxi))
  
  # results(dds.gxi, contrast =list("conditionkras.platformrnaEASY"))
  # 
  dds.682.gxi.lfc <- lfcShrink(dds.gxi, coef = "condition_ZNF682_vs_Empty") %>%
    as.data.frame() %>%
    # filter(padj <= 0.05) %>%
    rownames_to_column('gene_id') #%>%
    # merge(salmon.quant$gxi.bed, by = 'gene_id')
  
  dds.257.gxi.lfc <- lfcShrink(dds.gxi, coef = "condition_ZNF257_vs_Empty") %>%
    as.data.frame() %>%
    # filter(padj <= 0.05) %>%
    rownames_to_column('gene_id') #%>%
    # merge(salmon.quant$gxi.bed, by = 'gene_id')

  return(list('de_682' = dds.682.gxi.lfc, 
              'de_257' = dds.257.gxi.lfc,
              'col.data' = colData.txi))
  
  
}

run.de.seq.contrast <- function(tximeta.object, base_level = 'ctrl'){
  
  tximeta.name <- deparse(substitute(tximeta.object))
  
  print(tximeta.name)
  
  colData(tximeta.object$gxi) %>% 
    as.data.frame() %>% 
    remove_rownames() %>% 
    mutate(condition = relevel(as.factor(condition), base_level)) -> colData.gxi
  
  print(levels(colData.gxi$condition))
  
  print(colData.gxi)

  count.matrix.gxi <- assay(tximeta.object$gxi, 'counts') %>% 
    as.data.frame() %>% 
    rownames_to_column('gene') %>% 
    mutate_if(is.numeric, round) %>% 
    column_to_rownames('gene')
    
  colData(tximeta.object$txi) %>% 
    as.data.frame() %>% 
    remove_rownames() -> colData.txi

  count.matrix.txi <- assay(tximeta.object$txi, 'counts') %>% 
    as.data.frame() %>% 
    rownames_to_column('tmp') %>% 
    mutate_if(is.numeric, round) %>% 
    column_to_rownames('tmp')
  
  # colData.gxi$platform <- relevel(as.factor(colData.gxi$platform), 'intra')
  
  # dds.gxi <- DESeqDataSetFromMatrix(countData = count.matrix.gxi, 
  #                                 colData = colData.gxi, 
  #                                 design = as.formula(~condition + 
  #                                                       platform + 
  #                                                       platform:condition))

  dds.txi <- DESeqDataSetFromMatrix(countData = count.matrix.txi,
                                    colData = colData.txi,
                                    design = as.formula(~platform))
  
  dds.gxi <- estimateSizeFactors(dds.gxi)
  
  filter <- rowSums(counts(dds.gxi, normalized=T) >= 10) >= 2
  
  dds.gxi <- DESeq(dds.gxi[filter,])
  
  print(resultsNames(dds.gxi))
  
  # results(dds.gxi, contrast =list("conditionkras.platformrnaEASY"))
  # 
 results(dds.gxi, contrast =list("conditionkras.platformrnaEASY"))

 ctrl.dds.gxi.lfc <- lfcShrink(dds.gxi, coef = "conditionkras.platformrnaEASY") %>%
   as.data.frame() %>%
   # filter(padj <= 0.05) %>%
   rownames_to_column('gene_id') %>%
   merge(salmon.quant$gxi.bed, by = 'gene_id')
 
 ctrl.dds.gxi.lfc <- lfcShrink(dds.gxi, coef = "conditionctrl.platformrnaEASY") %>%
   as.data.frame() %>%
   # filter(padj <= 0.05) %>%
   rownames_to_column('gene_id') %>%
   merge(salmon.quant$gxi.bed, by = 'gene_id')

  return(list('ctrl' = ctrl.dds.gxi.lfc, 
              'kras' = ctrl.dds.gxi.lfc,
              'col.data' = colData.txi))
  
  
}

CALL: run.de.seq

if (build_salmon.quant == T) { 

  salmon.quant$aale$intra$deseq <- run.de.seq(salmon.quant$aale$intra, base_level = 'ctrl')
  salmon.quant$aale$exo$deseq <- run.de.seq(salmon.quant$aale$exo, base_level = 'ctrl')
  salmon.quant$aale$kras_compare$deseq <- run.de.seq(salmon.quant$aale$kras_compare, base_level = 'kras')
  salmon.quant$aale$ctrl_compare$deseq <- run.de.seq(salmon.quant$aale$ctrl_compare, base_level = 'ctrl')
  salmon.quant$aale$platform_compare$deseq <- run.de.seq(salmon.quant$aale$platform_compare, base_level = 'ctrl')
  
  salmon.quant$hbec$lacz_v_v12$deseq <- run.de.seq(salmon.quant$hbec$lacz_v_v12, base_level = 'wt2_lacz')
  salmon.quant$a549$deseq <- run.de.seq(salmon.quant$a549, base_level = 'Empty')
  salmon.quant$all_intra$deseq <- run.de.seq(salmon.quant$all_intra)
  
}


# ------------------------------------------------------------------------------
if (build_ucsc.rmsk.salmon == T) { 

  ucsc.rmsk.salmon.quant$aale$intra$deseq <- run.de.seq(ucsc.rmsk.salmon.quant$aale$intra, base_level = 'ctrl')
  ucsc.rmsk.salmon.quant$aale$exo$deseq <- run.de.seq(ucsc.rmsk.salmon.quant$aale$exo, base_level = 'ctrl')
  ucsc.rmsk.salmon.quant$aale$kras_compare$deseq <- 
    run.de.seq(ucsc.rmsk.salmon.quant$aale$kras_compare, base_level = 'kras')
  ucsc.rmsk.salmon.quant$aale$ctrl_compare$deseq <- 
    run.de.seq(ucsc.rmsk.salmon.quant$aale$ctrl_compare, base_level = 'ctrl')
  ucsc.rmsk.salmon.quant$aale$platform_compare$deseq <- 
    run.de.seq(ucsc.rmsk.salmon.quant$aale$platform_compare, base_level = 'ctrl')
  
  ucsc.rmsk.salmon.quant$hbec$lacz_v_v12$deseq <- run.de.seq(ucsc.rmsk.salmon.quant$hbec$lacz_v_v12, base_level = 'wt2_lacz')
  ucsc.rmsk.salmon.quant$hbec$a549$deseq <- run.de.seq(ucsc.rmsk.salmon.quant$a549, base_case = 'Empty')
  ucsc.rmsk.salmon.quant$all_intra$deseq <- run.de.seq(ucsc.rmsk.salmon.quant$all_intra)
  
  saveRDS(ucsc.rmsk.salmon.quant, file = file.path(output.data.dir, 'ucsc.rmsk.salmon.quant_analysis_set.rds'))

}

CALL: a549 de seq

a549_de <- run.de.seq.A549.contrast(salmon.quant$a549, base_level = 'Empty')

a549_te_de <- run.de.seq.A549.contrast(ucsc.rmsk.salmon.quant$a549, base_level = 'Empty')

FUN: plot_pca

plot_pca <- function(counts, col.data) { 
  set.seed(234)
  ### Gencode PCA
  
  counts %>% apply(., 1, sd) -> pca_sd
  counts[!pca_sd == 0, ] -> pca_filt
  col.data -> pca_col_data
  
  pca_filt %>%
    t() %>% 
    as.data.frame() %>% 
    prcomp(scale=T) -> pca_out_raw
  
  pcs.of.interest <- apply(pca_out_raw$x, 2, var)
  pcs.props <-  pcs.of.interest/sum(pcs.of.interest)

  # convert output to dataframe
  pca.out <- as.data.frame(pca_out_raw$x) %>% 
    rownames_to_column('names') %>% 
    merge(pca_col_data, by ='names')
  # summarize PC contributions
  pca.out.summary <- 
    as.data.frame(summary(pca_out_raw)$importance) %>% 
    rownames_to_column('metric') %>% 
    gather(pc, value, -metric) %>% 
    spread(metric, value)
  # summarize PC contributions
  pca.out.contributions <- 
    as.data.frame(pca_out_raw$rotation) %>% 
    rownames_to_column('tx') %>% 
    gather(pc, contrib, -tx) %>% 
    #merge(gencode.reference,
    #      by.x = 'tx',
    #      by.y = 'gene.name') %>% 
    group_by(pc) %>% 
    mutate(contrib = abs(contrib)) %>% 
    filter(contrib > quantile(contrib, 0.95))
  
  return(list('pca.out' = pca.out,
              'pcs.props' = pcs.props))
}

CALL: plot_pca

if (build_salmon.quant == T) { 

  salmon.quant$aale$intra$deseq$pca <- plot_pca(salmon.quant$aale$intra$deseq$gxi.vst.counts,
                                                salmon.quant$aale$intra$deseq$col.data)
  
  
  salmon.quant$aale$exo$deseq$pca <- plot_pca(salmon.quant$aale$exo$deseq$gxi.vst.counts,
                                                salmon.quant$aale$exo$deseq$col.data)
}

FUN: make.fgsea.ranks

make.fgsea.ranks <- function(de.seq){
  
  # build a ranked gene list in the appropriate format (named list)
  # using shrunken log2 Fold Change values from DESeqII
  de.seq <- 
    de.seq %>%
    mutate(rank = as.double(log2FoldChange),
           Gene = as.character(gene)) %>% 
    dplyr::select(Gene, rank) %>%
    mutate(rank = scale(rank)) 
  
  de.seq.rnk <- 
    de.seq$rank
  de.seq.rnk <- 
    setNames(de.seq$rank, de.seq$Gene)

  # run the GSEA implementation on the hallmark sets supplemented with custom sets
  print('fgsea')
  de.seq.fgsea.res <- 
    fgsea(pathways = msig.df, 
          stats = de.seq.rnk, eps = 0.0)
  
  print('padj filter and clean names')
  # convert to a tidy format, clean the names for display
  de.seq.fgsea.df <-
    as_tibble(de.seq.fgsea.res) %>%
    filter(padj < 0.05) %>%
    mutate(pathway = gsub('_', ' ', pathway))

  
  return(de.seq.fgsea.df)
  
}

CALL: make.fgsea.ranks

  msig.df <- 
    msigdbr(species = 'Homo sapiens', category = 'H') %>% 
    dplyr::select(gene_symbol, gs_name) %>% 
    split(x = .$gene_symbol, f = .$gs_name)
  
  msig.df$HALLMARK_KRAS_G12D_ZNF_DN <- salmon.quant$aale$intra$deseq$de %>% 
    filter(gene %in% filter(meta.data$trono.krab.znfs, ZNF == 'KRAB-ZNFs')$gene,
           log2FoldChange <= -1) %>% 
    dplyr::select(gene) %>% unlist() %>% unname()
  
if (build_salmon.quant == T) { 

  salmon.quant$aale$intra$deseq$H.fgsea <- make.fgsea.ranks(salmon.quant$aale$intra$deseq$de)
  salmon.quant$aale$exo$deseq$H.fgsea <- make.fgsea.ranks(salmon.quant$aale$exo$deseq$de)
  salmon.quant$aale$kras_compare$deseq$H.fgsea <- make.fgsea.ranks(salmon.quant$aale$kras_compare$deseq$de)
  salmon.quant$aale$ctrl_compare$deseq$H.fgsea <- make.fgsea.ranks(salmon.quant$aale$ctrl_compare$deseq$de)
  salmon.quant$aale$platform_compare$deseq$H.fgsea <- make.fgsea.ranks(salmon.quant$aale$platform_compare$deseq$de)
  salmon.quant$hbec$hbec_wt$deseq$H.fgsea <- make.fgsea.ranks(salmon.quant$hbec$hbec_wt$deseq$de)
  salmon.quant$hbec$hbec_v12$deseq$H.fgsea <- make.fgsea.ranks(salmon.quant$hbec$hbec_v12$deseq$de)
  salmon.quant$hbec$lacz_v_wt$deseq$H.fgsea <-make.fgsea.ranks(salmon.quant$hbec$lacz_v_wt$deseq$de)
  salmon.quant$hbec$lacz_v_v12$deseq$H.fgsea <- make.fgsea.ranks(salmon.quant$hbec$lacz_v_v12$deseq$de)
  
  msig.df <- 
    msigdbr(species = 'Homo sapiens') %>% 
    dplyr::select(gene_symbol, gs_name) %>% 
    filter(!grepl('HALLMARK', gs_name)) %>% 
    split(x = .$gene_symbol, f = .$gs_name)
  
  salmon.quant$aale$intra$deseq$A.fgsea <- make.fgsea.ranks(salmon.quant$aale$intra$deseq$de)
  salmon.quant$aale$exo$deseq$A.fgsea <- make.fgsea.ranks(salmon.quant$aale$exo$deseq$de)
  salmon.quant$aale$kras_compare$deseq$A.fgsea <- make.fgsea.ranks(salmon.quant$aale$kras_compare$deseq$de)
  salmon.quant$aale$ctrl_compare$deseq$A.fgsea <- make.fgsea.ranks(salmon.quant$aale$ctrl_compare$deseq$de)
  salmon.quant$aale$platform_compare$deseq$A.fgsea <- make.fgsea.ranks(salmon.quant$aale$platform_compare$deseq$de)
  salmon.quant$hbec$hbec_wt$deseq$A.fgsea <- make.fgsea.ranks(salmon.quant$hbec$hbec_wt$deseq$de)
  salmon.quant$hbec$hbec_v12$deseq$A.fgsea <- make.fgsea.ranks(salmon.quant$hbec$hbec_v12$deseq$de)
  salmon.quant$hbec$lacz_v_wt$deseq$A.fgsea <- make.fgsea.ranks(salmon.quant$hbec$lacz_v_wt$deseq$de)
  salmon.quant$hbec$lacz_v_v12$deseq$A.fgsea <- make.fgsea.ranks(salmon.quant$hbec$lacz_v_v12$deseq$de)
  
  saveRDS(salmon.quant, file = file.path(output.data.dir, 'salmon.quant_analysis_set.rds'))
}

generate FIMO output

library(GenomicRanges)
library(BSgenome.Hsapiens.UCSC.hg38)
library(BSgenome)

ifn_g <- msig.df$HALLMARK_INTERFERON_ALPHA_RESPONSE
ifn_a <- msig.df$HALLMARK_INTERFERON_GAMMA_RESPONSE
jak_stat <- msig.df$HALLMARK_IL6_JAK_STAT3_SIGNALING

options(meme_db = file.path(output.data.dir, 'reference.data', 
                            'HOCOMOCOv11_core_HUMAN_mono_meme_format.meme'))



salmon.quant$aale$intra$deseq$de %>% 
  filter(padj < 0.05, log2FoldChange > 1.5, gene %in% c(ifn_g, ifn_a, jak_stat)) %>% 
  # filter(gene %in% filter(salmon.quant$hbec$lacz_v_v12$deseq$de, padj < 0.05, log2FoldChange > 0.5)$gene) %>%
  select(gene_id) %>% unlist() %>% unname() -> de_list

salmon.quant$aale$intra$deseq$de %>% 
  filter(padj > 0.05, gene %in% c(ifn_g, ifn_a, jak_stat)) %>% 
  # filter(gene %in% filter(salmon.quant$hbec$lacz_v_v12$deseq$de, padj > 0.05, log2FoldChange > 0)$gene) %>%
  select(gene_id) %>% unlist() %>% unname() -> bg_de_list

salmon.quant$aale$intra$deseq$de %>% 
  filter(padj < 0.05, log2FoldChange > 0, gene %in% c(ifn_g, ifn_a), !gene_id %in% c(de_list)) %>% 
  filter(gene %in% filter(salmon.quant$hbec$lacz_v_v12$deseq$de, padj < 0.05, log2FoldChange < 0)$gene) %>%
  select(gene_id) %>% unlist() %>% unname() -> de_list_w_hbec


# salmon.quant$hbec$lacz_v_v12$deseq$de %>% 
#   filter(padj < 0.05, log2FoldChange > 0) %>% #, gene %in% c(ifn_g, ifn_a)) %>% 
#   filter(!gene %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, log2FoldChange > 0)$gene) %>% 
#   select(gene_id) %>% unlist() %>% unname() -> de_list_w_hbec

gr <- rowRanges(salmon.quant$gxi)

ifn_de_gr <- gr[names(gr) %in% de_list]
ng_de_gr <- gr[names(gr) %in% bg_de_list]

promoter_seqs <- 
  GenomicFeatures::getPromoterSeq(subject = BSgenome.Hsapiens.UCSC.hg38, 
                                  query = ifn_de_gr, upstream = 500, downstream = 500)

promoter_seqs_hbec <- 
  GenomicFeatures::getPromoterSeq(subject = BSgenome.Hsapiens.UCSC.hg38, 
                                   query = ng_de_gr, upstream = 500, downstream = 500)


# dreme_results <- memes::runMeme(input = promoter_seqs, control = NA)
# 
tomTom_res <- memes::runAme(control = promoter_seqs_hbec,
                            input = promoter_seqs)

tomTom_res %>% 
  mutate(tmp = motif_id) %>% 
  separate(tmp, sep = '_', into = c('TF', NA)) %>% 
  mutate(TF = str_replace(TF, pattern = '^ZN', replacement = 'ZNF'),
         TF = str_replace(TF, pattern = 'ZNFF', replacement = 'ZNF'),
         TF = str_replace(TF, pattern = '^NFAC', replacement = 'NFATC')) -> ame_tf


isre_motif <- MotifDb::query(MotifDb::MotifDb, "ISRE") %>% 
  universalmotif::convert_motifs()

isre_motif <- isre_motif[[1]]

irf9_motif <- MotifDb::query(MotifDb::MotifDb, "IRF9_HUMAN.H11MO.0.C") %>% 
  universalmotif::convert_motifs()

irf7_motif <- MotifDb::query(MotifDb::MotifDb, "IRF7_HUMAN.H11MO.0.C") %>% 
  universalmotif::convert_motifs()

irf1_motif <- MotifDb::query(MotifDb::MotifDb, "IRF1_HUMAN.H11MO.0.A") %>% 
  universalmotif::convert_motifs()

fos_motif <- MotifDb::query(MotifDb::MotifDb, "FOS") %>% 
  universalmotif::convert_motifs()

foxp2_motif <- MotifDb::query(MotifDb::MotifDb, 'FOXP2_HUMAN.H11MO.0.C') %>% 
  universalmotif::convert_motifs()


irf1_fimo_results <- memes::runFimo(promoter_seqs, irf1_motif, thresh = 1e-4) %>% 
  as.data.frame() %>% 
  merge(salmon.quant$aale$intra$deseq$de, by.x = 'seqnames', by.y = 'gene_id')
  # select(gene) %>% distinct()

irf7_fimo_results <- memes::runFimo(promoter_seqs, irf7_motif, thresh = 1e-4) %>% 
  as.data.frame() %>% 
  merge(salmon.quant$aale$intra$deseq$de, by.x = 'seqnames', by.y = 'gene_id')

irf9_fimo_results <- memes::runFimo(promoter_seqs, irf9_motif, thresh = 1e-4) %>% 
  as.data.frame() %>% 
  merge(salmon.quant$aale$intra$deseq$de, by.x = 'seqnames', by.y = 'gene_id') 

fos_fimo_results <- memes::runFimo(promoter_seqs, fos_motif, thresh = 1e-6) %>% 
  as.data.frame() %>% 
  merge(salmon.quant$aale$intra$deseq$de, by.x = 'seqnames', by.y = 'gene_id') 
  # select(gene) %>% distinct()

foxp2_fimo_results <- memes::runFimo(promoter_seqs, foxp2_motif, thresh = 1e-3) %>% 
  as.data.frame() %>% 
  merge(salmon.quant$aale$intra$deseq$de, by.x = 'seqnames', by.y = 'gene_id') 
  # select(gene) %>% distinct()

irf9_fimo_results <- memes::runFimo(promoter_seqs, irf9_motif, thresh = 1e-3) %>% 
  as.data.frame() %>% 
  merge(salmon.quant$aale$intra$deseq$de, by.x = 'seqnames', by.y = 'gene_id') 
  # select(gene) %>% distinct()

ATAC

datasets

kras_atac_anno_peaks <- 
  read_tsv(file.path(output.data.dir, 
                     'narrow_output.data/bwa/mergedLibrary/macs/narrowPeak/kras_R1.mLb.clN_peaks.annotatePeaks.txt'))

ctrl_atac_anno_peaks <- 
  read_tsv(file.path(output.data.dir, 
                     'narrow_output.data/bwa/mergedLibrary/macs/narrowPeak/ctrl_R1.mLb.clN_peaks.annotatePeaks.txt'))

consensus_atac_anno_peak_counts <- 
  read_tsv(file.path(output.data.dir, 
                     'narrow_output.data/bwa/mergedLibrary/macs/narrowPeak/consensus/consensus_peaks.mLb.clN.featureCounts.txt'),
           skip = 1)

consensus_atac_boolean_anno <- 
  read_tsv(file.path(output.data.dir, 
                     'narrow_output.data/bwa/mergedLibrary/macs/narrowPeak/consensus/consensus_peaks.mLb.clN.boolean.annotatePeaks.txt'
                     ))

parsed_datasets & GREAT

irf9_fimo_results %>%
  mutate(start.x = start.y + start.x,
         end.x = start.y + end.x) %>% 
  select(chr, start = start.x, end = end.x, strand = strand.x, gene, motif_id) %>% 
  unite(col = 'name', sep = '_', c(gene, motif_id)) %>% 
  makeGRangesFromDataFrame(keep.extra.columns = T) -> irf9_bed

consensus_atac_anno_peak_counts %>% 
  select(chr = Chr, start = Start, end = End, strand = Strand, name = Geneid) %>% 
  makeGRangesFromDataFrame(keep.extra.columns = T) -> consensus_bed

consensus_atac_boolean_anno %>% 
  filter(ctrl_R1.mLb.clN.bool == F) %>% 
  select(chr, start, end, name = interval_id) %>% 
  makeGRangesFromDataFrame(keep.extra.columns = T) -> kras_atac_bed

consensus_atac_boolean_anno %>% 
  filter(ctrl_R1.mLb.clN.bool == F) %>% 
  separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>% 
  filter(annotation == "promoter-TSS ") %>%
  group_by(`Gene Name`) %>%
  tally() -> uniq_kras_peak_genes

consensus_atac_boolean_anno %>% 
  filter(ctrl_R1.mLb.clN.bool == F) %>% 
  filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, log2FoldChange > 1.5)$gene) %>%
  filter(`Gene Name` %in% c(ifn_g, ifn_a)) %>%
  separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>% 
  filter(annotation == "promoter-TSS ") %>%
  group_by(`Gene Name`) %>%
  tally() -> uniq_ifn_kras_peak_genes

consensus_atac_boolean_anno %>% 
  filter(kras_R1.mLb.clN.bool == F) %>% 
  separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>% 
  filter(annotation == "promoter-TSS ") %>%
  group_by(`Gene Name`) %>%
  tally() -> uniq_ctrl_peak_genes

consensus_atac_boolean_anno %>% 
  separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>% 
  select(chr, start, end, gene = `Gene Name`) %>% 
  mutate(strand = '+') -> bg_peak_input_bed

consensus_atac_boolean_anno %>% 
  filter(ctrl_R1.mLb.clN.bool == F) %>% 
  filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, log2FoldChange > 1.5)$gene) %>%
  separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>% 
  select(chr, start, end, gene = `Gene Name`) %>% 
  mutate(strand = '+') -> uniq_kras_peak_input_bed

uniq_kras_great_job <- rGREAT::submitGreatJob(gr = uniq_kras_peak_input_bed, species = 'hg38', bg = bg_peak_input_bed)

uniq_kras_great_tb <- rGREAT::getEnrichmentTables(uniq_kras_great_job, download_by = 'tsv')

consensus_atac_boolean_anno %>% 
  filter(kras_R1.mLb.clN.bool == F) %>% 
  filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, log2FoldChange < -1.5)$gene) %>%
  separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>% 
  select(chr, start, end, gene = `Gene Name`) %>% 
  mutate(strand = '+') -> uniq_ctrl_peak_input_bed

uniq_ctrl_job <- rGREAT::submitGreatJob(uniq_ctrl_peak_input_bed, species = 'hg38', bg = bg_peak_input_bed)

uniq_ctrl_tb <- rGREAT::getEnrichmentTables(uniq_ctrl_job, download_by = 'tsv')

peak ranges

consensus_atac_boolean_anno %>% 
  filter(ctrl_R1.mLb.clN.bool == F) %>% 
  filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, log2FoldChange > 1.5)$gene) %>%
  filter(`Gene Name` %in% c(ifn_g, ifn_a)) %>% 
  separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>% 
  filter(annotation == "promoter-TSS ") %>% 
  mutate(strand = '+') %>% 
  select(chr, start, end, strand, name = `Gene Name`) %>% 
  GenomicRanges::makeGRangesFromDataFrame() -> ifn_uniq_peak_ranges

names(ifn_uniq_peak_ranges) <- uniq_ifn_kras_peak_genes$`Gene Name`

consensus_atac_boolean_anno %>% 
  filter(ctrl_R1.mLb.clN.bool == T) %>%
  filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, log2FoldChange > 0.5)$gene) %>%
  filter(`Gene Name` %in% c(ifn_g, ifn_a), ! is.na(`Gene Name`), ! `Gene Name` %in% names(ifn_uniq_peak_ranges)) %>% 
  separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>% 
  filter(annotation == "promoter-TSS ") %>% 
  mutate(strand = '+') %>% 
  select(chr, start, end, strand, name = `Gene Name`) %>% 
  GenomicRanges::makeGRangesFromDataFrame() -> other_uniq_peak_ranges

names(other_uniq_peak_ranges) <- consensus_atac_boolean_anno %>% 
                                    filter(ctrl_R1.mLb.clN.bool == T) %>%
  filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, log2FoldChange > 0.5)$gene) %>%
  filter(`Gene Name` %in% c(ifn_g, ifn_a), ! is.na(`Gene Name`), ! `Gene Name` %in% names(ifn_uniq_peak_ranges)) %>% 
  separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>% 
  filter(annotation == "promoter-TSS ") %>% 
  select(`Gene Name`) %>% unlist() %>% unname()

DE promoters

salmon.quant$aale$intra$deseq$de %>% 
  filter(padj < 0.05, log2FoldChange > 1.5) %>% 
  select(gene_id) %>% unlist() %>% unname() -> de_list

gr <- rowRanges(salmon.quant$gxi)

gencode_de_gr <- gr[names(gr) %in% de_list]

de_promoter_regions <- promoters(gencode_de_gr, upstream = 1000, downstream = 200)

getSeq(BSgenome.Hsapiens.UCSC.hg38, de_promoter_regions) -> upreg_gencode_seqs


salmon.quant$aale$intra$deseq$de %>% 
  filter(padj < 0.05, log2FoldChange < -1.5,
         !gene %in% meta.data$trono.krab.znfs$gene) %>%
  select(gene_id) %>% unlist() %>% unname() -> dn_de_list

dn_gr <- gr[names(gr) %in% dn_de_list]

dn_promoter_regions <- promoters(dn_gr, upstream = 1000, downstream = 200)

getSeq(BSgenome.Hsapiens.UCSC.hg38, dn_promoter_regions) -> dnreg_gencode_seqs

TE ranges

build_ranges_lists = TRUE


if (file.exists(file.path(output.data.dir, 'intra.aale.te_ranges_list.rds')) &
    file.exists(file.path(output.data.dir, 'exo.aale.te_ranges_list.rds')) &
    file.exists(file.path(output.data.dir, 'hbec.aale.te_ranges_list.rds'))) {
  
  build_ranges_lists = FALSE
  
  intra.aale.te_ranges_list <- 
    readRDS(file.path(output.data.dir, 'intra.aale.te_ranges_list.rds'))
  exo.aale.te_ranges_list <- 
    readRDS(file.path(output.data.dir, 'exo.aale.te_ranges_list.rds'))
  hbec.aale.te_ranges_list <- 
    readRDS(file.path(output.data.dir, 'hbec.aale.te_ranges_list.rds'))

}

TE filters

if (file.exists(file.path(output.data.dir, 'rmsk_parsed_bed.rds'))) {
  
  rmsk_parsed_bed <- readRDS(file.path(output.data.dir, 'rmsk_parsed_bed.rds'))
  
} else {
  
  meta.data$ucsc.rmsk.tx.to.gene.fmt %>% 
  select(enst, gene, biotype) %>% 
  separate(enst, sep = '_', into = c(NA, NA, 'gene', 'position', NA, NA, 'strand', NA)) %>% 
  mutate(position = sub('range=', '', position),
         strand = sub('strand=', '', strand)) -> rmsk_bed

  rmsk_bed %>% 
    filter(grepl(':', position)) %>% 
    separate(position, sep = ':', into = c('chr', 'range')) %>% 
    separate(range, sep = '-', into = c('start', 'end')) -> rmsk_parsed_bed
  
  saveRDS(rmsk_parsed_bed, file.path(output.data.dir, 'rmsk_parsed_bed.rds'))
  
}



imap(list('aale_intra' = ucsc.rmsk.salmon.quant$aale$intra$deseq$de,
          'hbec_intra' = ucsc.rmsk.salmon.quant$hbec$lacz_v_v12$deseq$de,
          'aale_exo' = ucsc.rmsk.salmon.quant$aale$exo$deseq$de), function(x, x_name) {
         # ), function(x, x_name) {

           x %>% 
             filter(!gene %in% meta.data$gencode.35.tx.to.gene$gene,
                    padj < 0.05, log2FoldChange > 0.65, !grepl('\\)n', gene)) %>% 
             mutate(context = x_name)
         # }) %>% bind_rows() -> upreg_te
         }) -> upreg_te


imap(list('aale_intra' = ucsc.rmsk.salmon.quant$aale$intra$deseq$de,
          'h358_intra' = ucsc.rmsk.salmon.quant$hbec$lacz_v_v12$deseq$de,
          'aale_exo' = ucsc.rmsk.salmon.quant$aale$exo$deseq$de), function(x, x_name) {
           x %>% 
             filter(!gene %in% meta.data$gencode.35.tx.to.gene$gene,
                    padj < 0.05, log2FoldChange < -1, !grepl('\\)n', gene)) %>% 
             mutate(context = x_name)
         }) %>% bind_rows() -> dnreg_te



intra.rmsk.txi <- as.data.frame(assay(ucsc.rmsk.salmon.quant$aale$intra$txi, 'counts'))
exo.rmsk.txi <- as.data.frame(assay(ucsc.rmsk.salmon.quant$aale$exo$txi, 'counts'))
hbec.rmsk.txi <- as.data.frame(assay(ucsc.rmsk.salmon.quant$hbec$lacz_v_v12$txi, 'counts'))


te_filter <- function(txi_df) {
  
  rmsk.txi <- txi_df[!grepl('^ENST', rownames(txi_df)), ]

  rmsk.txi %>% 
    rownames_to_column('enst') %>% 
    select(enst, contains('kras')) %>% 
    filter_if(is.numeric, any_vars(. > 2)) %>% 
    separate(enst, sep = '_', into = c(NA, NA, 'gene', 'position', NA, NA, 'strand', NA)) %>% 
    mutate(position = sub('range=', '', position),
           strand = sub('strand=', '', strand)) %>% 
    separate(position, sep = ':', into = c('chr', 'tmp')) %>% 
    separate(tmp, sep = '-', into = c('start', 'end')) -> rmsk.txi.bed
  
  
  rmsk.txi.bed %>% 
    select(-strand) %>% 
    merge(rmsk_parsed_bed, by = c('gene', 'chr', 'start', 'end')) -> filter_rmsk_parsed_bed
  
  filter_rmsk_parsed_bed
  
}

intra.rmsk.txi_filter_parsed <- te_filter(intra.rmsk.txi)
exo.rmsk.txi_filter_parsed <- te_filter(exo.rmsk.txi)
hbec.rmsk.txi_filter_parsed <- te_filter(hbec.rmsk.txi)

Ranges Lists

if(build_ranges_lists == TRUE) {
  tf_list_generator <- function(upreg_te_list,
                                filter_te_list) {
  
    lapply(unique(upreg_te_list$gene), function(te) {
  
      rmsk_parsed_bed %>% 
        filter(gene %in% upreg_te_list$gene,
               gene == te) %>% 
        GenomicRanges::makeGRangesFromDataFrame() -> te_ranges
      
      names(te_ranges) <- rep_len(te, length(te_ranges))
      
      te_ranges
      
    }) -> upreg_te_ranges_list
    
    lapply(unique(filter_te_list$gene), function(te) { 
  
      filter_te_list %>% 
        filter(gene %in% upreg_te_list$gene,
               gene == te) %>% 
        GenomicRanges::makeGRangesFromDataFrame() -> te_ranges
      
      names(te_ranges) <- rep_len(te, length(te_ranges))
      
      te_ranges
      
    }) -> filtered_te_ranges_list
    
    lapply(unique(filter_te_list$biotype), function(bt) {
  
      rmsk_parsed_bed %>% 
        filter(gene %in% upreg_te_list$gene,
               biotype == bt) %>% 
        GenomicRanges::makeGRangesFromDataFrame() -> te_ranges
      
      names(te_ranges) <- rep_len(bt, length(te_ranges))
      
      te_ranges
      
    }) -> filtered_bt_ranges_list
    
    return(list('upreg' = upreg_te_ranges_list,
                'filter_gene' = filtered_te_ranges_list,
                'filter_biotype' = filtered_bt_ranges_list))
  }
  
  intra.aale.te_ranges_list <- tf_list_generator(upreg_te$aale_intra, intra.rmsk.txi_filter_parsed)
  exo.aale.te_ranges_list <- tf_list_generator(upreg_te$aale_exo, exo.rmsk.txi_filter_parsed)
  hbec.aale.te_ranges_list <- tf_list_generator(upreg_te$hbec_intra, hbec.rmsk.txi_filter_parsed)
  
  plot_te_bamSignal <- function(pos) {
    width(te_ranges_list[[pos]])
  
    median(width(te_ranges_list[[pos]]))
    
    te_ranges_list[[pos]][width(te_ranges_list[[pos]]) > 10, ] -> te_ranges_tmp
    
    resize(te_ranges_tmp, width = median(width(te_ranges_list[[pos]]))) -> te_ranges_tmp
    
    bamsignals::bamCoverage(kras_bam_path, 
                            te_ranges_tmp, paired.end = 'extend') -> kras_de_te_profile
    bamsignals::bamCoverage(ctrl_bam_path, 
                            te_ranges_tmp, paired.end = 'extend') -> ctrl_de_te_profile
    
    bamsignals::alignSignals(kras_de_te_profile) %>% 
      as.data.frame() %>% 
      mutate(position = row_number()) %>% 
      gather(prom, cov, -position) %>%
      mutate(cov = (cov/kras_norm_factor) * 1e6,
             condition = 'kras',
             context = 'de') -> kras_te_prom_df
    
    bamsignals::alignSignals(ctrl_de_te_profile) %>% 
      as.data.frame() %>% 
      mutate(position = row_number()) %>% 
      gather(prom, cov, -position) %>%
      mutate(cov = (cov/ctrl_norm_factor) * 1e6,
             condition = 'ctrl',
             context = 'de') -> ctrl_te_prom_df
      
    rbind(kras_te_prom_df, ctrl_te_prom_df) %>%
      ggplot(aes(position, cov, group = condition, color = condition)) +
        stat_summary(geom="ribbon", fun.data=mean_cl_normal, width=0.1, conf.int=0.95, 
                     fill='grey', alpha = 0.1, size = rel(0.05), show.legend = F)+
        # stat_summary(geom="line", fun=mean, linetype="dashed", alpha = 0.15, size = rel(0.05)) +
        stat_summary(geom="point", fun=mean, alpha = 1, size = rel(1)) +
        scale_color_manual(values = c(viridis(3)[1], viridis(3)[2])) +
        ylab('mean CPM (95% CI)') + xlab('position within insertion') +
        theme(legend.position = c(0.99, 0.99))
    
  }
  
  general_ame_tf_generator <- function(te_ranges_list) {
    
      lapply(te_ranges_list, function(te_ranges) {
      
      
        if(length(te_ranges) > 0 & length(te_ranges) < 6000) {
          
          names(te_ranges) %>% unique() -> name
          
          print(name)
          print(length(te_ranges))
          
          getSeq(BSgenome.Hsapiens.UCSC.hg38, te_ranges) -> te_seqs
    
          memes::runAme(te_seqs, control = 'shuffle') -> te_motif_enrich
          
          if (!is.null(te_motif_enrich)) {
                  te_motif_enrich %>%
            mutate(tmp = motif_id,
                   TE = name) %>%
            separate(tmp, sep = '_', into = c('TF', NA)) %>% 
            mutate(TF = str_replace(TF, pattern = '^ZN', replacement = 'ZNF'),
            TF = str_replace(TF, pattern = 'ZNFF', replacement = 'ZNF'),
            TF = str_replace(TF, pattern = '^NFAC', replacement = 'NFATC')) -> ame_tf
    
          ame_tf
          }
      }
    })
  }
  
  intra.aale.te_ranges_list$general_ame_tf <- general_ame_tf_generator(intra.aale.te_ranges_list$filter_gene)
  exo.aale.te_ranges_list$general_ame_tf <- general_ame_tf_generator(exo.aale.te_ranges_list$filter_gene)
  hbec.aale.te_ranges_list$general_ame_tf <- general_ame_tf_generator(hbec.aale.te_ranges_list$filter_gene)
  
  te_tf_plots <- function(ame_tf_list, upreg_te_list, de_list) {
    
    ame_tf_list %>% 
      bind_rows() %>% 
      filter(adj.pvalue < 0.05) %>% 
      separate(motif_id, sep = '[.]', into = c('TF', NA, NA, NA)) %>% 
      mutate(TF = str_replace(TF, pattern = '_HUMAN', replacement = '')) %>% 
      merge(de_list, by.x = 'TF', by.y = 'gene') %>% 
      merge(meta.data$te.families.clades, by.x = 'TE', by.y = 'gene') %>% 
      merge(upreg_te_list, by.x = 'TE', by.y = 'gene') -> general_ame_tf_to_plt
      
      
    general_ame_tf_to_plt %>% 
      filter(padj.x < 0.05) %>% 
      ggplot(aes(reorder(TE, -log2FoldChange.y), log2FoldChange.x)) + 
      geom_boxplot(outlier.colour = NA, fill = NA) +
      geom_point() +
      ggrepel::geom_text_repel(aes(label = ifelse(log2FoldChange.x < -4, TF, ''))) + 
      ylab('log2FoldChange') + 
      xlab('') + 
      theme(axis.text.x = element_text(angle = 40, hjust = 1)) -> target_plot
    
  
    general_ame_tf_to_plt %>% 
      select(TF, log2FoldChange.x, padj.x, TE, context) %>% 
      distinct() %>% 
      group_by(TF) %>% 
      summarise(log2FoldChange = median(log2FoldChange.x), 
                padj = median(padj.x), 
                TE = list(TE)) %>% 
      ggplot(aes(TE)) + 
      geom_bar() + 
      ggupset::scale_x_upset() + 
      ylab('shared TFs w/ present motifs') +
      geom_text(stat='count', aes(label=after_stat(count)), vjust = -0.17, size = txt.mm_to_pts(0.8)) -> upset_plot
    
    
    general_ame_tf_to_plt %>% 
      select(TF, log2FoldChange.x, padj.x, clade, context) %>% 
      distinct() %>% 
      group_by(TF) %>% 
      summarise(log2FoldChange = median(log2FoldChange.x), 
                padj = median(padj.x), 
                clade = list(clade)) %>% 
      mutate(gene_set = ifelse(TF %in% c(ifn_a, ifn_g), 'IFN', 'other')) %>% 
      mutate(gene_set = ifelse(TF %in% meta.data$trono.krab.znfs$gene, 'ZNF', gene_set)) %>%
      mutate(gene_set = ifelse(TF %in% msig.df$HALLMARK_KRAS_SIGNALING_UP, 'KRAS', gene_set)) %>%
      mutate(gene_set = factor(gene_set, levels = c('KRAS', 'IFN', 'ZNF', 'other'))) %>% 
      ggplot(aes(log2FoldChange, -log10(padj), color = gene_set)) + 
      geom_point(alpha = 0.4) + 
      scale_color_manual(values = c(viridis(4)[1], viridis(4)[2], viridis(4)[3], 'grey'), name = 'TF classification') +
      ggrepel::geom_text_repel(aes(label = ifelse(gene_set != 'other' | TF == 'FOS' | abs(log2FoldChange) > 2, TF, '')), 
                                   show.legend = F,
                                   max.overlaps = 20, 
                               size = txt.mm_to_pts(0.8)) + 
      theme(legend.position = c(0.99,0.99)) -> volcano_plot
    
    return(list('target_plt' = target_plot, 'upset_plt' = upset_plot, 'volcano_plt' = volcano_plot))
  }
  
  intra.aale.te_ranges_list$plts <- te_tf_plots(intra.aale.te_ranges_list$general_ame_tf, upreg_te$aale_intra, salmon.quant$aale$intra$deseq$de)
  exo.aale.te_ranges_list$plts <- te_tf_plots(exo.aale.te_ranges_list$general_ame_tf, upreg_te$aale_exo, salmon.quant$aale$intra$deseq$de)
  hbec.aale.te_ranges_list$plts <- te_tf_plots(hbec.aale.te_ranges_list$general_ame_tf, upreg_te$hbec_intra, salmon.quant$hbec$lacz_v_v12$deseq$de)
  
  znf_list_generator <- function(te_ranges_list) {
    te_ranges_list %>% 
      lapply(function(te_ranges) {
        
        
        if(length(te_ranges) > 0 & length(te_ranges) < 6000) {
          
          names(te_ranges) %>% unique() -> name
          
          print(name)
          print(length(te_ranges))
          
          getSeq(BSgenome.Hsapiens.UCSC.hg38, te_ranges) -> te_seqs
    
          memes::runAme(te_seqs, control = 'shuffle', 
                        database = file.path(output.data.dir, 'reference.data', 
                                'Hughes_ZNF_motifs.meme')) -> te_motif_enrich
            if (!is.null(te_motif_enrich)) {
  
              te_motif_enrich %>%
                mutate(tmp = motif_id,
                       TE = name) %>%
                separate(tmp, sep = '_', into = c('TF', NA)) -> ame_tf
        
              ame_tf
            }
        }
      }) 
  }
  
  intra.aale.te_ranges_list$znf_ame_tf <- znf_list_generator(intra.aale.te_ranges_list$filter_gene)
  exo.aale.te_ranges_list$znf_ame_tf <- znf_list_generator(exo.aale.te_ranges_list$filter_gene)
  hbec.aale.te_ranges_list$znf_ame_tf <- znf_list_generator(hbec.aale.te_ranges_list$filter_gene)
  
  te_znf_plots <- function(ame_znf_list, upreg_te_list, de_list) {
    
    ame_znf_list %>% 
      bind_rows() %>%
      mutate(TF = str_split_fixed(TF, pattern = '[.]', n = 5)[,1]) %>%
      filter(adj.pvalue < 0.05) %>% 
      merge(de_list, by.x = 'TF', by.y = 'gene') %>%
      merge(meta.data$te.families.clades, by.x = 'TE', by.y = 'gene') %>% 
      merge(upreg_te_list, by.x = 'TE', by.y = 'gene') -> tmp
      
      
    tmp %>% 
      filter(padj.x < 0.05) %>% 
      ggplot(aes(reorder(TE, -log2FoldChange.y), log2FoldChange.x, color = log2FoldChange.y)) + 
      # geom_boxplot(outlier.colour = NA, fill = NA, size = 0.5) +
      geom_point() +
      ggrepel::geom_text_repel(aes(label = ifelse(log2FoldChange.x < -2, TF, '')), size = txt.mm_to_pts(0.8), color = 'black') + 
      ylab('ZNF RNA log2FoldChange') + 
      xlab('putatively bound TE') + 
      scale_color_viridis_c(guide = guide_colorbar(title = 'TE log2FC')) +
      theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
      facet_row(~clade, scales = 'free_x', space = 'free') -> target_plot
    
  
    tmp %>% 
      select(TF, log2FoldChange.x, padj.x, TE, context) %>% 
      distinct() %>% 
      group_by(TF) %>% 
      summarise(log2FoldChange = median(log2FoldChange.x), 
                padj = median(padj.x), 
                TE = list(TE)) %>% 
      ggplot(aes(TE)) + 
      geom_bar() + 
      ggupset::scale_x_upset() + 
      geom_text(stat='count', aes(label=after_stat(count)), vjust = -0.23, size = txt.mm_to_pts(0.8)) -> upset_plot
    
    
    tmp %>% 
      select(TF, log2FoldChange.x, padj.x, clade, context) %>% 
      distinct() %>% 
      group_by(TF) %>% 
      summarise(log2FoldChange = median(log2FoldChange.x), 
                padj = median(padj.x), 
                clade = list(clade)) %>% 
      mutate(gene_set = ifelse(TF %in% c(ifn_a, ifn_g), 'IFN', 'other')) %>% 
      mutate(gene_set = ifelse(TF %in% meta.data$trono.krab.znfs$gene, 'ZNF', gene_set)) %>%
      mutate(gene_set = ifelse(TF %in% msig.df$HALLMARK_KRAS_SIGNALING_UP, 'KRAS', gene_set)) %>%
      mutate(gene_set = factor(gene_set, levels = c('KRAS', 'IFN', 'ZNF', 'other'))) %>% 
      ggplot(aes(log2FoldChange, -log10(padj), color = gene_set)) + 
      geom_point(alpha = 0.4) + 
      scale_color_manual(values = c(viridis(4)[1], viridis(4)[2], viridis(4)[3], 'grey'), name = 'diff. exp. TF classification') +
      ggrepel::geom_text_repel(aes(label = ifelse(gene_set != 'other' & abs(log2FoldChange) < 1 | TF == 'FOS' | abs(log2FoldChange) > 3, TF, '')), 
                                   show.legend = F,
                                   max.overlaps = 20, size = txt.mm_to_pts(0.8)) + 
      theme(legend.position = c(0.99,0.99)) -> volcano_plot
    
    tmp %>% 
      filter(padj.x < 0.05) %>% 
      group_by(TE, clade) %>% 
      mutate(clade = factor(clade, levels = c('DNA', 'LINE', "LTR", 'Satellite', 'SINE'))) %>% 
      summarize(znf_log2FC = mean(log2FoldChange.x),
                te_log2FC = mean(log2FoldChange.y)) %>% 
      ggplot(aes(te_log2FC, znf_log2FC, label = TE, color = clade)) + 
      geom_point() +
      ggrepel::geom_text_repel(show.legend = F, size = txt.mm_to_pts(0.8)) + 
      geom_hline(yintercept = 0, linetype = 'dashed', alpha = 0.4) +
      geom_vline(xintercept = 0, linetype = 'dashed', alpha = 0.4) +
      scale_color_manual(values = te_alone_color_pal, name = 'TE Clade') +
      ylab('TE-targeting ZNFs avg. RNA log2FC') +
      xlab('putatively bound TE RNA log2FoldChange') +
      theme(legend.position = c(0.99,0.99)) -> scatter_plot
    
    return(list('de_list' = tmp, 'target_plt' = target_plot, 'upset_plt' = upset_plot, 'volcano_plt' = volcano_plot, 'scatter_plt' = scatter_plot))
  }
  
  intra.aale.te_ranges_list$znf_plts <- te_znf_plots(intra.aale.te_ranges_list$znf_ame_tf, upreg_te$aale_intra, salmon.quant$aale$intra$deseq$de)
  exo.aale.te_ranges_list$znf_plts <- te_znf_plots(exo.aale.te_ranges_list$znf_ame_tf, upreg_te$aale_exo, salmon.quant$aale$intra$deseq$de)
  hbec.aale.te_ranges_list$znf_plts <- te_znf_plots(hbec.aale.te_ranges_list$znf_ame_tf, upreg_te$hbec_intra, salmon.quant$hbec$lacz_v_v12$deseq$de)
  
  
  znf_score <- read_csv(file = file.path(output.data.dir, 'all.znf.te.score.csv'))
  
  znf_score %>% 
    dplyr::rename('gene' = '...1') -> znf_score
  
  
  znf_score_plt <- function(gencode_de_list, te_de_list) {
    
    znf_score %>% 
      gather(TE, score, -gene) %>% 
      group_by(gene) %>% 
      filter(score > 0) %>% 
      mutate(rank = dense_rank(-score)) %>% 
      filter(!grepl('Mamrep', TE)) %>% 
      separate(TE, sep = '\\/', into = c('clade', 'family', 'TE')) %>% 
      filter(gene %in% filter(gencode_de_list, padj < 0.05, 
                              log2FoldChange < -0.75)$gene) -> tmp
    
    tmp %>% 
      merge(te_de_list %>%
              filter(padj < 0.05, log2FoldChange > 0.5), by.x = 'TE', by.y = 'gene') %>% 
      merge(gencode_de_list, by = 'gene') %>% 
      filter(rank < 50) %>%
      ggplot(aes(TE, rank, color = log2FoldChange.y)) +
      geom_point() +
      scale_color_viridis_c(name = 'znf log2FC') +
      ggrepel::geom_text_repel(aes(label = ifelse(rank < 12, gene, '')), size = txt.mm_to_pts(0.8), color = 'black') +
      xlab('putatively bound TE') + ylab('binding score rank') +
      theme(legend.position = c(0.99,0.99),
            axis.text.x = element_text(angle = 40, hjust = 1)) +
      facet_row(~clade, scales = 'free_x', space = 'free')
    
  }
  
  intra.aale.te_ranges_list$znf_plts$znf_score <- znf_score_plt(salmon.quant$aale$intra$deseq$de, ucsc.rmsk.salmon.quant$aale$intra$deseq$de)
  exo.aale.te_ranges_list$znf_plts$znf_score <- znf_score_plt(salmon.quant$aale$intra$deseq$de, ucsc.rmsk.salmon.quant$aale$exo$deseq$de)
  hbec.aale.te_ranges_list$znf_plts$znf_score <- znf_score_plt(salmon.quant$hbec$lacz_v_v12$deseq$de, ucsc.rmsk.salmon.quant$hbec$lacz_v_v12$deseq$de)
  
  saveRDS(intra.aale.te_ranges_list, file.path(output.data.dir, 'intra.aale.te_ranges_list.rds'))
  saveRDS(exo.aale.te_ranges_list, file.path(output.data.dir, 'exo.aale.te_ranges_list.rds'))
  saveRDS(hbec.aale.te_ranges_list, file.path(output.data.dir, 'hbec.aale.te_ranges_list.rds'))
}

ATAC Coverage

library(bamsignals)
ctrl_norm_factor <- 43576060
kras_norm_factor <- 60276212

kras_bam_path <- file.path(output.data.dir, 'narrow_output.data/bwa/mergedLibrary/kras_R1.mLb.clN.sorted.bam')

ctrl_bam_path <- file.path(output.data.dir, 'narrow_output.data/bwa/mergedLibrary/ctrl_R1.mLb.clN.sorted.bam')


salmon.quant$aale$intra$deseq$de %>% 
  filter(padj < 0.05, log2FoldChange > 1.5, gene %in% c(ifn_g, ifn_a, jak_stat)) %>% 
  select(gene_id) %>% unlist() %>% unname() -> de_list

gr <- rowRanges(salmon.quant$gxi)

ifn_de_gr <- gr[names(gr) %in% de_list]

de_promoter_regions <- promoters(ifn_de_gr, upstream = 1000, downstream = 500)

bamsignals::bamCoverage(kras_bam_path, de_promoter_regions, paired.end = 'extend') -> kras_de_prom_profile
bamsignals::bamCoverage(ctrl_bam_path, de_promoter_regions, paired.end = 'extend') -> ctrl_de_prom_profile

bamsignals::alignSignals(kras_de_prom_profile) %>% 
  as.data.frame() %>% 
  mutate(position = row_number()) %>% 
  gather(prom, cov, -position) %>%
  mutate(position = position - 1000,
         cov = (cov/kras_norm_factor) * 1e6,
         condition = 'kras',
         context = 'de') -> kras_de_prom_df


bamsignals::alignSignals(ctrl_de_prom_profile) %>% 
  as.data.frame() %>% 
  mutate(position = row_number()) %>% 
  gather(prom, cov, -position) %>%
  mutate(position = position - 1000,
         cov = (cov/ctrl_norm_factor) * 1e6,
         condition = 'ctrl',
         context = 'de') -> ctrl_de_prom_df

gr <- rowRanges(salmon.quant$gxi)

salmon.quant$aale$intra$deseq$de %>% 
  filter(gene %in% meta.data$trono.krab.znfs$gene,
         padj < 0.05, log2FoldChange <= -4.5) %>% 
  select(gene_id) %>% unlist() %>% unname() -> znf_de_list

znf_de_gr <- gr[names(gr) %in% znf_de_list]

de_promoter_regions <- promoters(znf_de_gr, upstream = 1000, downstream = 500)

bamsignals::bamCoverage(kras_bam_path, de_promoter_regions, paired.end = 'extend') -> kras_de_prom_profile
bamsignals::bamCoverage(ctrl_bam_path, de_promoter_regions, paired.end = 'extend') -> ctrl_de_prom_profile

bamsignals::alignSignals(kras_de_prom_profile) %>% 
  as.data.frame() %>% 
  mutate(position = row_number()) %>% 
  gather(prom, cov, -position) %>%
  mutate(position = position - 1000,
         cov = (cov/kras_norm_factor) * 1e6,
         condition = 'kras',
         context = 'de') -> znf_kras_de_prom_df

bamsignals::alignSignals(ctrl_de_prom_profile) %>% 
  as.data.frame() %>% 
  mutate(position = row_number()) %>% 
  gather(prom, cov, -position) %>%
  mutate(position = position - 1000,
         cov = (cov/ctrl_norm_factor) * 1e6,
         condition = 'ctrl',
         context = 'de') -> znf_ctrl_de_prom_df


de_promoter_regions_for_meme <- promoters(ifn_de_gr, upstream = 300, downstream = 500) %>% 
  memes::get_sequence(genome = BSgenome.Hsapiens.UCSC.hg38)

salmon.quant$aale$intra$deseq$de %>%
  filter(padj > 0.5, gene %in% c(ifn_g, ifn_a)) %>%
  select(gene_id) %>% unlist() %>% unname() -> ns_list
ns_gr <- gr[names(gr) %in% ns_list]
ns_promoter_regions <- promoters(ns_gr, upstream = 300, downstream = 500) %>% 
  memes::get_sequence(genome = BSgenome.Hsapiens.UCSC.hg38)


ifn_ame_out <- memes::runAme(de_promoter_regions_for_meme, control = ns_promoter_regions)

ifn_ame_out %>% 
  mutate(tmp = motif_id) %>% 
  separate(tmp, sep = '_', into = c('TF', NA)) %>% 
  mutate(TF = str_replace(TF, pattern = '^ZN', replacement = 'ZNF'),
         TF = str_replace(TF, pattern = 'ZNFF', replacement = 'ZNF'),
         TF = str_replace(TF, pattern = '^NFAC', replacement = 'NFATC')) -> ifn_ame_out

ATAC BW signal

library(trackplot)
library(scales)

bigWigs <- c(file.path(output.data.dir, 'atac_out/bwa/mergedLibrary/bigwig/kras_R1.mLb.clN.bigWig'),
             file.path(output.data.dir, 'atac_out/bwa/mergedLibrary/bigwig/ctrl_R1.mLb.clN.bigWig'))

plot_big_wig_coverage <- function(loci, finish_plot = F, gene_name, y_lab = '', lab_tag = '') { 
  
  # loci <- paste0(chr, ':', start, '-', stop)
  
  chr <- str_split(loci, pattern = c(':'))[[1]][1]
  start <- as.numeric(str_replace_all(str_split(str_split(loci, pattern = c(':'))[[1]][2], c('-'))[[1]][1], ',' ,''))
  stop <- as.numeric(str_replace_all(str_split(str_split(loci, pattern = c(':'))[[1]][2], c('-'))[[1]][2], ',' ,''))
  
  track_data = track_extract(bigWigs = bigWigs, loci = loci, binsize = 1)

  track_data = track_summarize(summary_list = track_data, condition = c("KRAS", "CTRL"), stat = "mean")
  
  gene_model <- trackplot::.extract_geneModel_ucsc(chr = chr,
                                                    start = start,
                                                    end = stop,
                                                    refBuild = 'hg38')
    
  max_vector <- c(as.data.frame(track_data$KRAS)$max, as.data.frame(track_data$CTRL)$max)

  tx_tbl <- trackplot::.collapse_tx(gene_model)[[gene_name]]
  
  region_start <- min(as.data.frame(track_data$KRAS)$start)
  
  region_stop <- max(as.data.frame(track_data$KRAS)$end)
  
  first_exon_start <- tx_tbl[[1]][1]
  
  first_exon_stop <- tx_tbl[[2]][1]
  
  exon_start_vector <- tx_tbl[[1]]
  
  exon_end_vector <- tx_tbl[[2]]
  
  first_gene_name <- attr(tx_tbl, 'gene')
  
  start_stop_df <- data.frame(x = first_exon_stop,
                          start = exon_start_vector, 
                          end = exon_end_vector,
                          y = 0.5)

  gene_line_end <- ifelse(max(start_stop_df$end) >= region_stop, region_stop, max(start_stop_df$end))
  
  if (attr(tx_tbl, 'strand') == '+') {
    arrow_end = 'last'
    gene_line_start <- ifelse(min(start_stop_df$start) >= region_start, min(start_stop_df$start), region_start)

  } else {
    arrow_end = 'first'
    gene_line_start <- ifelse(min(start_stop_df$start) >= region_start, min(start_stop_df$start), region_start)

  }
  

  # break_step = (stop-start)/break_denom

  track_data$KRAS %>% 
    as.data.frame() %>% 
    ggplot(aes(start, max)) + 
    scale_x_continuous(labels = comma) +
    scale_y_continuous(limits = c(0, max(max_vector)), 
                       expand = c(0,0), breaks = get_breaks(n = 4)) +
    ylab(y_lab) + 
    theme(axis.line.x = element_blank(), 
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(), 
        plot.title = element_text(face = 'bold'),
        plot.margin=margin(b = 0,unit = "mm")) +
    xlab('') +
    ggtitle(first_gene_name) +
    geom_area(fill = alpha(viridis(3)[2], 1), color = alpha(viridis(3)[2], 1)) -> A.plt
  
  ggplot(data = start_stop_df, 
         aes(x = x, 
             xmin = start, 
             xend = end, 
             y = y, 
             yend = y)) + 
    scale_x_continuous(limits = c(region_start, region_stop)) +
    scale_y_continuous(limits = c(0,1)) +
    geom_segment(aes(x = gene_line_start, xend = gene_line_end), 
                 color = 'darkblue', size = 0.75, 
                 alpha = 0.1, arrow = arrow(ends = arrow_end)) +
    geom_segment(aes(x = start, xmin = start, xend = end, y = y, yend = y), 
                 color = 'darkblue', size = 12) +
    ylab('') + xlab('') +
    theme(axis.line.y = element_blank(), 
          axis.text.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.line.x = element_blank(), 
          axis.text.x = element_blank(),
          axis.ticks.x = element_blank(),
          plot.margin=margin(0,0,0,0,unit = "mm")) -> B.plt
  
  track_data$CTRL %>% 
    as.data.frame() %>% 
    ggplot(aes(start, max)) + 
    scale_y_continuous(limits = c(0, max(max_vector)), 
                      expand = c(0,0), breaks = get_breaks(n = 4)) +
    scale_x_continuous(labels = comma, breaks = get_breaks(n = 3)) +
    ylab('') + xlab(chr) +
    geom_area(fill = alpha(viridis(3)[1], 1), color = alpha(viridis(3)[1], 1)) +
    theme(axis.line.x = element_blank(),
          plot.margin=margin(t = 0,unit = "mm")) -> C.plt

  p4 <- ggplot(data.frame(l = A.plt$labels$y, x = 1, y = 1)) +
        geom_text(aes(x, y, label = l), angle = 90) + 
        theme_void() +
        coord_cartesian(clip = "off")
  
  if (finish_plot == T) {
    
      atac_layout <- "
      DA
      DB
      DC
      "
  
      wrap_plots(A.plt + ylab(''),
                 B.plt,
                 C.plt + ylab(''),
                 p4,
                 design = atac_layout, 
                 heights = c(0.85,0.15, 0.85),
                 widths = c(0.01, 1))
  } else {
    
      atac_layout <- "
      A
      B
      C
      "
  
      wrap_plots(A.plt + labs(tag = lab_tag),
                 B.plt,
                 C.plt,
                 design = atac_layout, 
                 heights = c(1,0.1, 1),
                 widths = c(1))
    
  }
  

  
}

# , y_lab = 'norm.\nread depth'

plot_big_wig_coverage(loci = 'chr14:24,158,954-24,163,321', gene_name = 'IRF9') -> irf9_atac.plt

plot_big_wig_coverage(loci = 'chr11:613,326-618,086', gene_name = 'IRF7') -> irf7_atac.plt

plot_big_wig_coverage(loci = 'chr14:94,109,404-94,112,652', gene_name = 'IFI27') -> ifi27_atac.plt

plot_big_wig_coverage(loci = 'chr12:112,978,011-112,986,730', gene_name = 'OAS2') -> oas2_atac.plt

plot_big_wig_coverage(loci = 'chr1:78,649,502-78,650,799', gene_name = 'IFI44') -> ifi44_atac.plt

plot_big_wig_coverage(loci = 'chr21:41,418,539-41,427,685', gene_name = 'MX1') -> mx1_atac.plt

ifn_atac_layout <- "
GABC
GDEF
"

p4 <- ggplot(data.frame(l = 'scaled read depth', x = 1, y = 1)) +
      geom_text(aes(x, y, label = l), angle = 90, size = txt.mm_to_pts(0.9)) + 
      theme_void() +
      coord_cartesian(clip = "off")
  
wrap_plots(irf9_atac.plt,
           irf7_atac.plt,
           ifi27_atac.plt,
           oas2_atac.plt,
           ifi44_atac.plt,
           mx1_atac.plt,
           p4,
           design = ifn_atac_layout,
           widths = c(0.1, 1, 1, 1)) -> ifn_atac_6.plt

plot_big_wig_coverage(loci = 'chr19:20,074,561-20,081,789', gene_name = 'ZNF90') -> znf90_atac.plt

plot_big_wig_coverage(loci = 'chr19:20,418,910-20,432,132', gene_name = 'ZNF826P') -> znf826p_atac.plt

plot_big_wig_coverage(loci = 'chr7:64,297,656-64,316,528', gene_name = 'ZNF736') -> znf736_atac.plt

plot_big_wig_coverage(loci = 'chr19:56,497,894-56,520,251', gene_name = 'ZNF471') -> znf471_atac.plt

plot_big_wig_coverage(loci = 'chr19:20,030,435-20,054,396', gene_name = 'ZNF682') -> znf682_atac.plt

plot_big_wig_coverage(loci = 'chr7:6,613,987-6,618,813', gene_name = 'ZNF853') -> znf853_atac.plt


znf_atac_layout <- "
GABC
GDEF
"
  
wrap_plots(znf90_atac.plt,
           znf826p_atac.plt,
           znf736_atac.plt,
           znf471_atac.plt,
           znf682_atac.plt,
           znf853_atac.plt,
           p4,
           design = znf_atac_layout,
           widths = c(0.1, 1, 1, 1)) -> znf_atac_6.plt

znf

salmon.quant$aale$intra$deseq$de %>% 
  filter(padj < 0.05, log2FoldChange < -1, gene %in% meta.data$trono.krab.znfs$gene | grepl('^ZNF', gene)) %>% 
  select(gene_id) %>% unlist() %>% unname() -> znf_de_list

salmon.quant$aale$intra$deseq$de %>% 
  filter(padj < 0.05, log2FoldChange > 1.5) %>% 
  select(gene_id) %>% unlist() %>% unname() -> up_de_list

gr <- rowRanges(salmon.quant$gxi)

znf_de_gr <- gr[names(gr) %in% znf_de_list]
up_de_gr <- gr[names(gr) %in% up_de_list]

consensus_atac_boolean_anno %>% 
  filter(ctrl_R1.mLb.clN.bool == T) %>%
  filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, 
                                 log2FoldChange < -1.5)$gene) %>%
  filter(`Gene Name` %in% meta.data$trono.krab.znfs$gene | grepl('^ZNF', `Gene Name`)) %>%
  separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>% 
  filter(annotation == "promoter-TSS ") %>%
  # filter(chr != 'chr19') %>% 
  mutate(strand = '+') %>%
  select(chr, start, end, strand, name = `Gene Name`) %>% 
  GenomicRanges::makeGRangesFromDataFrame() -> ctrl_znf_uniq_peak_ranges

consensus_atac_boolean_anno %>% 
  filter(kras_R1.mLb.clN.bool == T) %>%
  filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, 
                                 log2FoldChange > 1.5)$gene) %>%
  filter(`Gene Name` %in% c(ifn_g, ifn_a)) %>%
  separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>% 
  filter(annotation == "promoter-TSS ") %>%
  # filter(chr != 'chr19') %>% 
  mutate(strand = '+') %>%
  select(chr, start, end, strand, name = `Gene Name`) %>% 
  GenomicRanges::makeGRangesFromDataFrame() -> kras_ifn_uniq_peak_ranges

znf_promoter_seqs <-
  GenomicRanges::promoters(znf_de_gr, upstream = 1000, downstream = 500) %>%
  memes::get_sequence(genome = BSgenome.Hsapiens.UCSC.hg38)

upDe_promoter_seqs <-
  GenomicRanges::promoters(up_de_gr, upstream = 1000, downstream = 200) %>%
  memes::get_sequence(genome = BSgenome.Hsapiens.UCSC.hg38)

znf_uniq_peaks_seqs <- 
  memes::get_sequence(regions = ctrl_znf_uniq_peak_ranges, genome = BSgenome.Hsapiens.UCSC.hg38)

ifn_uniq_peaks_seqs <- 
  memes::get_sequence(regions = kras_ifn_uniq_peak_ranges, genome = BSgenome.Hsapiens.UCSC.hg38)

znf_ame_out <- memes::runAme(znf_promoter_seqs, 
                             control = upDe_promoter_seqs)

znf_ame_out %>% 
  mutate(tmp = motif_id) %>% 
  separate(tmp, sep = '_', into = c('TF', NA)) %>% 
  mutate(TF = str_replace(TF, pattern = '^ZN', replacement = 'ZNF'),
         TF = str_replace(TF, pattern = 'ZNFF', replacement = 'ZNF'),
         TF = str_replace(TF, pattern = '^NFAC', replacement = 'NFATC')) %>% 
  filter(adj.pvalue < 0.05) -> znf_ame_out

Figure 1

toil data load and transform

toil.survival <- read_tsv(file.path(output.data.dir, 'tcga.data/TCGA-LUAD.survival.tsv.gz')) 

luad.phenotypes <- 
  read_tsv(file.path(output.data.dir, 'tcga.data/TCGA-LUAD.GDC_phenotype.tsv.gz')) %>% 
  select(sample = submitter_id.samples, 
       sample_type = sample_type.samples,
       stage = tumor_stage.diagnoses,
       age = age_at_index.demographic,
       sex = gender.demographic,
       race = race.demographic)

luad.mutect <- read_tsv(file.path(output.data.dir, 'tcga.data/TCGA-LUAD.mutect2_snv.tsv.gz')) %>% 
  filter(gene %in% c('KRAS')) %>% 
  select(sample = Sample_ID, gene, aa_change = Amino_Acid_Change) %>% 
  pivot_wider(names_from = gene, values_from = aa_change) %>% 
  mutate(across(.cols = where(is.list), paste)) 

luad.phenotypes.merge <- luad.phenotypes %>% 
  merge(luad.mutect, by = 'sample', all.x = T) %>% 
  filter(sample_type %in% c('Primary Tumor', 'Solid Tissue Normal')) %>% 
  merge(toil.survival, by = 'sample') %>% 
  mutate(sample = str_sub(sample, end = str_length(sample) - 1)) %>% 
  replace_na(list(KRAS = 'WT')) %>% 
  mutate(KRAS = ifelse(sample_type == 'Solid Tissue Normal', 'WT_normal', KRAS))

norm.counts.final <- read_tsv(file.path(output.data.dir, 'tcga.data', 'luad.data.frame.tsv'))

norm.counts.final %>% 
  filter(sample != 'TCGA-44-3917-01') %>% 
  filter(KRAS %in% c('p.G12D', 'WT_normal')) -> working_toil_counts

complex plotting

plot_complex_heatmap <- function(filter_values, show_genes = F, split_vector = NULL, split_title = NULL) {
  
   set.seed(123)
  
  ## heatmap count matrix
  working_toil_counts %>% 
    filter(gene %in% filter_values) %>%
    group_by(gene) %>%
    mutate(count = scale(log1p(count), center = T)) %>%
    select(sample, gene, count) %>%
    pivot_wider(names_from = gene, values_from = count) %>%
    distinct() %>% 
    column_to_rownames('sample') %>% 
    t() -> toil_hm_counts
  
  ## heatmap mutation annotation 
  working_toil_counts %>% 
    select(sample, 'G12D' = KRAS) %>%
    distinct() %>% 
    column_to_rownames('sample') %>% 
    ComplexHeatmap::columnAnnotation(df = . , 
                                     col = list(G12D = c('WT_normal' = 'white', 
                                                         'p.G12D' = viridis(3)[2])),
                                      show_legend = F,
                                      annotation_name_side = 'left',
                                      annotation_name_gp = grid::gpar(fontsize = 8),
                                      simple_anno_size = unit(2.5, "mm")) -> toil_sample_anno
  ## heatmap DE barpot annotation
  working_toil_counts %>% 
    filter(gene %in% filter_values) %>%
    select(gene) %>% 
    distinct() %>% 
    merge(salmon.quant$aale$intra$deseq$de %>% select(gene, log2FoldChange), by = 'gene') %>% 
    column_to_rownames('gene') -> toil_de
  
  toil_de[order(match(rownames(toil_de), rownames(toil_hm_counts))), , drop =F] -> toil_de
  
  toil_de_list <- setNames(toil_de$log2FoldChange, rownames(toil_de))
  
  ComplexHeatmap::rowAnnotation(
    `AALE RNA log2FC` = ComplexHeatmap::anno_barplot(x = toil_de_list,
        bar_width = 1, 
        which = 'row',
        gp = grid::gpar(col = "black", fill = viridis(3)[2]), 
        border = FALSE,
        annotation_name_gp = grid::gpar(fontsize = 8),
        width = unit(1.25, "cm"),
        height = unit(0.75, "cm")), 
    show_annotation_name = TRUE,
    annotation_name_gp = grid::gpar(fontsize = 8)) -> log2fc_ha
  
  working_toil_counts %>% 
  filter(gene %in% filter_values) %>%
  select(gene) %>% 
  distinct() %>% 
  mutate(IRF9 = ifelse(gene %in% irf9_fimo_results$gene, T, F),
         IRF1 = ifelse(gene %in% irf1_fimo_results$gene, T, F),
         IRF7 = ifelse(gene %in% irf7_fimo_results$gene, T, F)) %>% 
  column_to_rownames('gene') %>% 
  ComplexHeatmap::rowAnnotation(df =.,
                                col = list(IRF9 = c('TRUE' = viridis(2)[1], 'FALSE' = 'white'),
                                           IRF1 = c('TRUE' = viridis(2)[1], 'FALSE' = 'white'),
                                           IRF7 = c('TRUE' = viridis(2)[1], 'FALSE' = 'white')),
                                show_legend = F,
                                annotation_name_gp = grid::gpar(fontsize = 8),
                                simple_anno_size = unit(2, "mm")) -> gene_motif_anno
  
  ComplexHeatmap::Heatmap(toil_hm_counts, 
                        name = 'TCGA\nz-score', 
                        show_row_names = show_genes,
                        show_column_names = F,
                        row_split = split_vector,
                        row_title = split_title,
                        row_title_gp = grid::gpar(fontsize = 7, face = 'bold'),
                        row_names_side = 'right',
                        row_dend_width = unit(3, 'mm'),
                        left_annotation = gene_motif_anno,
                        right_annotation = log2fc_ha,
                        top_annotation = toil_sample_anno,
                        heatmap_legend_param = 
                          list(labels_gp = grid::gpar(fontsize = 7),
                               legend_gp = grid::gpar(fontsize = 7),
                               legend_direction = 'vertical',
                               legend_height = unit(3.5, "cm"),
                               title_gp = grid::gpar(fontsize = 7, face = 'bold')),
                        row_names_gp = grid::gpar(fontsize = 7),
                        heatmap_width = patch_fig_width*0.875)
    
  
}

consensus_atac_boolean_anno %>% 
  filter(ctrl_R1.mLb.clN.bool == F) %>% 
  filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, log2FoldChange > 1.5)$gene) %>%
  # filter(`Gene Name` %in% c(ifn_g, ifn_a)) %>%
  separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>% 
  filter(annotation == "promoter-TSS ") %>%
  # group_by(annotation) %>%
  group_by(`Gene Name`) %>%
  tally() -> aale_uniq_up

salmon.quant$aale$intra$deseq$de %>% 
  filter(padj < 0.05, log2FoldChange > 1.5, gene %in% c(ifn_a, ifn_g)) -> aale_ifn

unique_peak_aale_ifn_list <- unique(c(aale_uniq_up$`Gene Name`, aale_ifn$gene))

aale_ifn_chm <- plot_complex_heatmap(unique_peak_aale_ifn_list, show_genes = T, 
                                     split_vector = 2, split_title = "cluster_%s")

aale_ifn_chm_no_rownames <- aale_ifn_chm
aale_ifn_chm_no_rownames@row_names_param$show <- FALSE

split_rowOrder <- ComplexHeatmap::row_order(ComplexHeatmap::draw(aale_ifn_chm_no_rownames))

aale_ifn_chm <- plot_complex_heatmap(unique_peak_aale_ifn_list, show_genes = T)

aale_ifn_chm_rowOrder <- ComplexHeatmap::row_order(ComplexHeatmap::draw(aale_ifn_chm))

ifn_high_hm_zoom <- aale_ifn_chm[split_rowOrder[[1]], ]

patchwork

msig.df[grepl('INTERFERON', names(msig.df))] %>% 
                             unlist() %>% unname() -> ifn_gene_list

salmon.quant$aale$intra$deseq$de %>% 
  mutate(padj = ifelse(padj == 0, 
                       min(filter(salmon.quant$aale$intra$deseq$de, padj != 0)$padj), padj)) %>% 
  mutate(gene_set = ifelse(gene %in% ifn_gene_list,'IFN', 'other'),
         gene_set = ifelse(gene %in% msig.df$HALLMARK_KRAS_SIGNALING_UP | gene == 'KRAS', 'KRAS', gene_set),
         gene_set = ifelse(gene %in% meta.data$trono.krab.znfs$gene, 'ZNF', gene_set)) %>% 
  mutate(gene_set = factor(gene_set, levels = c('IFN','KRAS','ZNF', 'other'))) %>% 
  filter(padj < 0.05, gene_set != 'other') %>% 
  mutate(kras = ifelse(gene == 'KRAS', T, F)) %>% 
  ggplot(aes(log2FoldChange, -log10(padj), label = gene, color = kras)) + 
  geom_point(alpha = 0.6) + 
  ggrepel::geom_text_repel(aes(label = ifelse(abs(log2FoldChange) > 5 & gene_set != 'other' | 
                                                padj < 0.0001 & gene_set != 'other' | gene == "KRAS", gene, '')),
                           size = txt.mm_to_pts(0.8), segment.color = NA) +
  facet_col(~gene_set, scales = 'free_y') +
  xlab('RNA log2FoldChange') +
  scale_color_manual(values = c('black', 'red'), guide = 'none') +
  geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') -> plt_1A.plt

salmon.quant$aale$intra$deseq$H.fgsea %>% 
  arrange(NES) %>% 
  mutate(rank = row_number()) %>% 
  group_by(pathway) %>% 
  mutate(pathway = str_replace_all(pathway, 'HALLMARK ', ''),
         pathway = str_replace_all(pathway, ' ', '\n'),
         observed = length(unlist(leadingEdge)),
         NES = round(NES, 3)) %>% 
  # arrange(-NES, padj) %>% 
  filter(!grepl('ZNF', pathway)) %>% 
  ggplot(aes(NES, rank, color = -log10(padj))) + 
  geom_point(size = 4) +
  geom_segment(aes(x=0, xend=NES, y=rank, yend=rank)) +
  geom_text(aes(label = paste0(observed, '/', size), 
                x = NES - 0.75 * sign(NES)), 
            nudge_y = 0.15, 
            color = 'black', 
            size = txt.mm_to_pts(0.8)) + 
  geom_text(aes(label = pathway, 
              x = - .9 * sign(NES)), 
              nudge_y = 0.05, 
              color = 'black', 
              size = txt.mm_to_pts(0.8)) + 
  scale_color_viridis_c() +
  ylab('gene set') + xlab('GSEA NES') +
  geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') +
  theme(legend.position = c(0.99, 0.17),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank()) -> plt_1B.plt

salmon.quant$aale$intra$deseq$de %>% 
  dplyr::rename('AALE' = log2FoldChange) %>% 
  filter(padj < 0.05,
         gene %in% c(ifn_a, ifn_g)) %>% 
  merge(salmon.quant$hbec$lacz_v_v12$deseq$de %>% 
          dplyr::rename('HBEC' = log2FoldChange) %>% 
          filter(padj < 0.05) %>% 
          mutate(HBEC = HBEC), by = 'gene') %>% 
  ggplot(aes(AALE, HBEC)) +
  geom_point() + 
  xlab('AALE ISG RNA log2FC') + ylab('HBEC ISG RNA log2FC') +
  geom_hline(yintercept = 0, linetype = 'dashed', alpha = 0.4) + 
  geom_vline(xintercept = 0, linetype = 'dashed', alpha = 0.4) +
  geom_text_repel(aes(label = ifelse(AALE > 1 & HBEC > 1, gene, '')), 
                  size = txt.mm_to_pts(0.8),show.legend = F) + 
  theme(legend.position = c(0.99, 0.22)) -> plt_1C.plt

salmon.quant$aale$intra$deseq$de %>% 
  merge(ame_tf, by.x = 'gene', by.y = 'TF') %>% 
  ggplot(aes(log2FoldChange, -log10(padj), label = gene)) + 
  geom_point() + 
  geom_text_repel(color = 'black', size = txt.mm_to_pts(0.8)) + 
  geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') + 
  xlab('ISG-binding TF RNA log2FC') +
  theme(legend.position = c(0.35, 0.79)) -> plt_1D.plt

fig_1_patch <- "
AB
AB
CD
EE
"

wrap_plots(plt_1A.plt,
           plt_1B.plt,
           plt_1C.plt,
           plt_1D.plt,
           patchwork::wrap_elements(full = 
                                      grid::grid.grabExpr(ComplexHeatmap::draw(ifn_high_hm_zoom,
                                                                               heatmap_legend_side = 'right'), 
                                                                               width = patch_fig_width)), 
           heights = c(1,1,0.75,1.25),
           design = fig_1_patch) + 
  plot_annotation(tag_levels = 'A') &
  theme(plot.tag = element_text(size = rel(0.95), face = 'bold'),
        plot.tag.position = c(0.01, 0.99)) -> figure_1_patchwork.fig


ggsave2(plot = figure_1_patchwork.fig,
       filename = file.path(figure.dir, 'fig.1', 'figure_1_patchwork.pdf'),
       width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')

figure_1_patchwork.fig

Figure S1

salmon.quant$aale$intra$deseq$de %>% 
  mutate(padj = ifelse(padj == 0, 
                       min(filter(salmon.quant$aale$intra$deseq$de, padj != 0)$padj), padj)) %>% 
  mutate(biotype = ifelse(gene %in% meta.data$gencode.35.lnc.gene.names$gene, 'lncRNA', 'protein-coding')) %>% 
  mutate(biotype = factor(biotype, levels = c('protein-coding','lncRNA'))) %>% 
  filter(padj < 0.05) %>%
  ggplot(aes(log2FoldChange, -log10(padj), label = gene)) + 
  geom_point(alpha = 0.6) + 
  ggrepel::geom_text_repel(aes(label = ifelse(abs(log2FoldChange) > 5 |
                                                -log10(padj) > 250, gene, '')),
                           size = txt.mm_to_pts(0.8), segment.color = NA, force = 10, num.overlaps = 5) +
  facet_col(~biotype, scales = 'free') +
  xlab('RNA log2FoldChange') -> plt_S1A.plt

salmon.quant$aale$intra$deseq$pca$pca.out %>% 
  rownames_to_column('sample') %>% 
  ggplot(aes(PC1, PC2,
             fill = condition,
             color = condition)) + 
    geom_point(size = rel(2)) + 
    scale_size_continuous(range = c(0.1, 2.5), breaks = c(0,5,10,15,20)) + 
    xlab(paste('AALE in. PC1', 
               round(cumsum(salmon.quant$aale$intra$deseq$pca$pcs.props)[1], digits = 3), sep = ' ')) +
    ylab(paste('AALE in. PC2', round(cumsum(salmon.quant$aale$intra$deseq$pca$pcs.props)[2] - 
                                       cumsum(salmon.quant$aale$intra$deseq$pca$pcs.props)[1],
                            digits = 3), sep = ' ')) +
    scale_color_manual(values = c(viridis(3)[1], viridis(3)[2])) +
    geom_vline(xintercept = 0, linetype = 'dotted', alpha = 0.3, size = 0.25) +
    geom_hline(yintercept = 0, linetype = 'dotted', alpha = 0.3, size = 0.25) + 
    theme(legend.position = c(0.30, 0.45),
            legend.key.width = unit(0.125,'cm'),
            legend.key.height = unit(0.4, 'cm')) -> plt_S1B.plt

salmon.quant$hbec$lacz_v_v12$deseq$de %>% 
  mutate(padj = ifelse(padj == 0, 
                       min(filter(salmon.quant$hbec$lacz_v_v12$deseq$de, padj != 0)$padj), padj)) %>% 
  mutate(gene_set = ifelse(gene %in% ifn_gene_list,'HBEC-IFN', 'other'),
         gene_set = ifelse(gene %in% msig.df$HALLMARK_KRAS_SIGNALING_UP | gene == 'KRAS', 'HBEC-KRAS', gene_set),
         gene_set = ifelse(gene %in% meta.data$trono.krab.znfs$gene, 'HBEC-ZNF', gene_set)) %>% 
  mutate(gene_set = factor(gene_set, levels = c('HBEC-IFN','HBEC-KRAS','HBEC-ZNF', 'other'))) %>% 
  filter(padj < 0.05, gene_set != 'other') %>% 
  mutate(kras = ifelse(gene == 'KRAS', T, F)) %>% 
  ggplot(aes(log2FoldChange, -log10(padj), label = gene, color = kras)) + 
  geom_point(alpha = 0.6, shape = 24) + 
  ggrepel::geom_text_repel(aes(label = ifelse(abs(log2FoldChange) > 5 & gene_set != 'other' | 
                                                padj < 0.0001 & gene_set != 'other' | gene == "KRAS", gene, '')),
                           size = txt.mm_to_pts(0.8), segment.color = NA) +
  facet_col(~gene_set, scales = 'free_y') +
  scale_color_manual(values = c('black', 'red'), guide = 'none') +
  xlab('RNA log2FoldChange') +
  geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') -> plt_S1C.plt


fig_S1_patch <- "
AC
AC
BC
EE
"

wrap_plots(plt_S1A.plt,
           plt_S1B.plt,
           plt_S1C.plt,
           patchwork::wrap_elements(full = 
                          grid::grid.grabExpr(ComplexHeatmap::draw(aale_ifn_chm_no_rownames,
                                                                   heatmap_legend_side = 'right'), 
                                                                   width = patch_fig_width)), 
           design = fig_S1_patch,
           heights = c(0.75,0.75,0.75, 1.75)) + 
  plot_annotation(tag_levels = 'A') &
  theme(plot.tag = element_text(size = rel(0.95), face = 'bold'),
        plot.tag.position = c(0.01, 0.99)) -> figure_S1_patchwork.fig


ggsave(plot = figure_S1_patchwork.fig,
       filename = file.path(figure.dir, 'fig.1', 'figure_S1_patchwork.pdf'),
       width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')

figure_S1_patchwork.fig

Figure 2

rbind(kras_de_prom_df, ctrl_de_prom_df) %>% 
  mutate(condition = toupper(condition)) %>% 
  ggplot(aes(position, cov, group = condition, color = condition)) +
    stat_summary(geom="ribbon", fun.data=mean_cl_boot, width=0.1, conf.int=0.95, 
                 fill='grey', alpha = 0.1, size = rel(0.05), show.legend = F)+
    stat_summary(geom="point", fun=mean, alpha = 1, size = rel(1), show.legend = T) +
    scale_color_manual(values = c(viridis(3)[1], viridis(3)[2])) +
    xlab('distance to TSS') +
    ylab('mean CPM (95% CI)') + 
    xlim(-1000, 500) + 
    theme(legend.position = c(0.3, 0.99)) -> plt_2A.plt

salmon.quant$aale$intra$deseq$de %>% 
  merge(uniq_kras_peak_genes, by.x = 'gene', by.y = 'Gene Name') %>% 
  mutate(`unique peaks` = ifelse(gene %in% c(ifn_a, ifn_g), 'ISG', 'other')) %>% 
  ggplot(aes(log2FoldChange, -log10(padj), label = gene, color = `unique peaks`)) + 
  geom_point() +
  scale_color_manual(values = c('black', 'grey'), name = 'unique ATAC peak') +
  geom_text_repel(aes(label = ifelse(`unique peaks` == 'ISG' | log2FoldChange > 6, gene, '')), 
                  size = txt.mm_to_pts(0.8), show.legend = F, segment.color = NA) + 
  geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') + 
  xlab('RNA log2FoldChange')+
  theme(legend.position = c(0.40, 1)) -> plt_2B.plt


base::do.call(rbind, uniq_kras_great_tb) %>%
  group_by(Ontology) %>%
  top_n(30, -HyperFdrQ) %>% 
  filter(HyperFdrQ < 0.05,
         NumFgGenesHit > 1,
         grepl('interferon|virus|RNA|oligo|^ex', Desc),
         !grepl('part|poly', Desc)) %>%
  ggplot(aes(log2(RegionFoldEnrich), reorder(Desc, RegionFoldEnrich), label = Desc, color = -log10(HyperFdrQ))) +
  geom_point(aes(size = NumFgGenesHit)) +
  geom_text_repel(aes(label = Desc), 
                size = txt.mm_to_pts(0.8), show.legend = F, point.padding = unit(2, 'mm'), color = 'black') + 
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        plot.title = element_text(size = rel(0.7), face = 'plain'),
        legend.position = c(0.99,0.35), legend.direction = 'horizontal') +
  scale_x_continuous() +
  ylab('GO Term') + xlab('ATAC region GREAT log2(FoldEnrichment)') +
  scale_color_viridis_c() -> plt_2D.plt


fig_2_patch <- "
AB
CC
DD
"

wrap_plots(plt_2A.plt,
           plt_2B.plt,
           wrap_elements(full = ifn_atac_6.plt),
           plt_2D.plt,
           heights = c(0.75,1.5,0.75),
           design = fig_2_patch) +
  plot_annotation(tag_levels = 'A') -> figure_2_patchwork.fig


ggsave(plot = figure_2_patchwork.fig,
       filename = file.path(figure.dir, 'fig.2', 'figure_2_patchwork.pdf'),
       width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')

figure_2_patchwork.fig

Figure 3

kras.sight <- read_csv(file.path(output.data.dir, 'kras.nanosight.data.csv'), skip = 2,
                       col_names = c('size', 'r1', 'r2', 'r3', NA, NA, NA, NA)) %>% 
  mutate(condition = 'kras')
ctrl.sight <- read_csv(file.path(output.data.dir, 'ctrl.nanosight.data.csv'), skip = 2,
                       col_names = c('size', 'r1', 'r2', 'r3', NA, NA, NA, NA)) %>% 
  mutate(condition = 'ctrl')

nano_sight <- rbind(kras.sight, ctrl.sight)

nano_sight %>% 
  select(size, r1, r2, r3, condition) %>% 
  gather(rep, value, -condition, -size) %>% 
  filter(size <= 400) %>% 
  mutate(value = value/1e6,
         condition = paste0(toupper(condition), ' EV')) %>% 
  group_by(size, condition) %>% 
  summarize(y = mean(value),
            ymin = mean(value) - sd(value),
            ymax = mean(value) + sd(value)) %>% 
  ggplot(aes(size, y = y, ymin = ymin, ymax = ymax, fill = condition)) +
  geom_ribbon(alpha = 0.25) +
  geom_line(color = 'darkgrey') +
  scale_fill_manual(values = c(viridis(3)[1], viridis(3)[2]), guide = F) +
  stat_peaks(geom = 'text', span = 35, vjust = -0.5, y.label.fmt = '%#f') +
  facet_row(~condition) + 
  coord_cartesian(clip = 'off') +
  ylab('1e6 particles per mL') + xlab('nm') -> plt_3A.plt

salmon.quant$aale$exo$deseq$de %>% 
  dplyr::rename('EXO_aale' = log2FoldChange) %>% 
  filter(padj < 0.05) %>% 
  merge(salmon.quant$aale$intra$deseq$de %>% 
          dplyr::rename('INTRA_aale' = log2FoldChange) %>% 
          filter(padj < 0.05), by = 'gene') %>% 
  lm(EXO_aale ~ INTRA_aale, data = .) %>% 
  summary() -> exo_lm_out

b <- round(exo_lm_out$coefficients[1,1], 2)
m <- round(exo_lm_out$coefficients[2,1], 2)
r2 <- round(exo_lm_out$adj.r.squared, 2)

salmon.quant$aale$exo$deseq$de %>% 
  dplyr::rename('EXO_aale' = log2FoldChange) %>% 
  filter(padj < 0.05) %>% 
  merge(salmon.quant$aale$intra$deseq$de %>% 
          dplyr::rename('INTRA_aale' = log2FoldChange) %>% 
          filter(padj < 0.05), by = 'gene') %>% 
  mutate(unique = ifelse(gene %in% uniq_kras_peak_genes$`Gene Name`, 'KRAS', 
                         ifelse(gene %in% uniq_ctrl_peak_genes$`Gene Name`, 'CTRL', 'none'))) %>% 
  ggplot(aes(EXO_aale, INTRA_aale, color = unique)) + 
  geom_point() + 
  scale_color_manual(values = c(viridis(3)[1], viridis(3)[2], 'grey'), name = 'unique ATAC peak') +
  geom_hline(yintercept = 0, linetype = 'dashed', alpha = 0.4) + 
  geom_vline(xintercept = 0, linetype = 'dashed', alpha = 0.4) +
  geom_text_repel(aes(label = ifelse(abs(EXO_aale) > 1 & abs(INTRA_aale) > 1 | unique != 'none', gene, '')), 
                  size = txt.mm_to_pts(0.8), show.legend = F, segment.color = NA) +
  theme(legend.position = c(0.99, 0.23)) +
  geom_line(stat = 'smooth', method = 'lm', linetype = 'dashed', alpha = 0.8, color = 'purple') +
  annotate(geom = 'text', x = 2.35, y = -2, fontface = 'italic',
           label = paste0('y', ' ', ' = ',b,' + ',m,'x; \nadj. R2 = ', r2), 
           size = rel(3.75), alpha = 0.8) + 
  ylab('intracellular RNA log2FC') + xlab('extracellular RNA log2FC') -> plt_3B.plt

salmon.quant$aale$exo$deseq$de %>% 
  mutate(biotype = ifelse(gene %in% meta.data$gencode.35.lnc.gene.names$gene, 'lncRNA', 'coding')) %>%
  # filter(padj < 0.05) %>% 
  ggplot(aes(log2FoldChange, -log10(padj), label = gene, color = biotype)) + 
  geom_point(alpha = 0.6) + 
  scale_color_manual(values = c('black', 'grey')) +
  ggrepel::geom_text_repel(aes(label = ifelse(abs(log2FoldChange) > 1, gene, '')), size = txt.mm_to_pts(0.8),
                           show.legend = F) +
  geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') + 
  xlab('ex. RNA log2FoldChange') +
  theme(legend.position = c(0.25,0.99)) -> plt_3C.plt

salmon.quant$aale$intra$deseq$de %>% 
  filter(padj < 0.05, abs(log2FoldChange) > 0.5) %>% 
  mutate(context = ifelse(log2FoldChange > 0, 'in_up', 'in_dn')) %>% 
  select(gene, context) %>% 
  rbind(
    salmon.quant$aale$exo$deseq$de %>% 
    filter(padj < 0.05, abs(log2FoldChange) > 0.5) %>% 
    mutate(context = ifelse(log2FoldChange > 0, 'ex_up', 'ex_dn')) %>% 
    select(gene, context)
    ) %>% 
  group_by(gene) %>% 
  summarize(contexts = list(context)) %>% 
  ggplot(aes(contexts)) + 
  geom_bar() + 
  ggupset::scale_x_upset() +
  geom_text(stat='count', aes(label=after_stat(count)), vjust = -0.05, size = txt.mm_to_pts(0.8)) +
  xlab('') + ylab('overlapping DE genes') -> plt_3D.plt


salmon.quant$aale$exo$deseq$H.fgsea %>% 
  filter(pathway %in% salmon.quant$aale$intra$deseq$H.fgsea$pathway) %>% 
  mutate(context = 'ex.') %>% 
  rbind(salmon.quant$aale$intra$deseq$H.fgsea %>% 
          filter(pathway %in% salmon.quant$aale$exo$deseq$H.fgse$pathway) %>% 
          mutate(context = 'in.')) %>% 
  arrange(NES) %>% 
  mutate(rank = row_number()) %>% 
  group_by(pathway) %>% 
  mutate(pathway = str_replace_all(pathway, 'HALLMARK ', ''),
         observed = length(unlist(leadingEdge)),
         NES = round(NES, 3)) %>% 
  filter(!grepl('ZNF', pathway)) %>% 
  ggplot(aes(NES, context, color = -log10(padj))) + 
  geom_point(size = 4) +
  geom_segment(aes(x=0, xend=NES, y=context, yend=context)) +
  scale_color_viridis_c() +
  ylab('context') + xlab('GSEA NES') +
  ggforce::facet_col(~pathway, scales = 'free_y') +
  geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') +
  theme(legend.position = c(0.25, 0.99),
        legend.direction = 'vertical') -> plt_3E.plt

ucsc.rmsk.salmon.quant$aale$exo$deseq$de %>%
  filter(!gene %in% meta.data$gencode.35.tx.to.gene$gene,
         padj < 0.05) %>%
  merge(meta.data$te.families.clades, by = 'gene') %>% 
  ggplot(aes(log2FoldChange, -log10(padj), label = gene, color = clade)) + 
  geom_point(alpha = 0.6) + 
  # xlim(-2, 2) +
  scale_color_viridis_d(direction = -1) +
  ggrepel::geom_text_repel(aes(label = ifelse(abs(log2FoldChange) >= 1.15, gene, '')), show.legend = F) + 
  theme(legend.position = c(0.99,0.95)) +
  xlab('ex. TE RNA log2FoldChange')-> plt_3F.plt

fig_3_patch <- "
AB
CD
EF
"

wrap_plots(plt_3A.plt,
           plt_3C.plt,
           plt_3B.plt,
           wrap_elements(full = plt_3D.plt),
           plt_3E.plt,
           plt_3F.plt,
           design = fig_3_patch,
           heights = c(1, 1, 1)) + 
  plot_annotation(tag_levels = 'A')  -> figure_3_patchwork.fig


ggsave(plot = figure_3_patchwork.fig,
       filename = file.path(figure.dir, 'fig.3', 'figure_3_patchwork.pdf'),
       width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')
figure_3_patchwork.fig

Figure S2

salmon.quant$aale$exo$deseq$pca$pca.out %>% 
  rownames_to_column('sample') %>% 
  mutate(condition = toupper(condition)) %>% 
  ggplot(aes(PC1, PC2,
             fill = condition,
             color = condition)) + 
    geom_point(size = rel(2)) + 
    scale_size_continuous(range = c(0.1, 2.5), breaks = c(0,5,10,15,20)) + 
    xlab(paste('AALE ex. PC1', round(cumsum(salmon.quant$aale$exo$deseq$pca$pcs.props)[1], digits = 3), sep = ' ')) +
    ylab(paste('AALE ex. PC2', round(cumsum(salmon.quant$aale$exo$deseq$pca$pcs.props)[2] - 
                              cumsum(salmon.quant$aale$exo$deseq$pca$pcs.props)[1],
                            digits = 3), sep = ' ')) +
    scale_color_manual(values = c(viridis(3)[1], viridis(3)[2])) +
    geom_vline(xintercept = 0, linetype = 'dotted', alpha = 0.3, size = 0.25) +
    geom_hline(yintercept = 0, linetype = 'dotted', alpha = 0.3, size = 0.25) + 
    theme(legend.position = c(0.99, 0.30),
            legend.key.width = unit(0.125,'cm'),
            legend.key.height = unit(0.4, 'cm')) -> plt_S3A.plt

ucsc.rmsk.salmon.quant$aale$exo$deseq$counts %>% 
  filter(!gene %in% meta.data$gencode.35.tx.to.gene$gene) %>% 
  merge(meta.data$te.families.clades, by = 'gene') %>%
  filter(clade %in% c('DNA', "LINE",'SINE', 'LTR', 'Satellite')) %>% 
  mutate(clade = factor(clade, levels = c('DNA', 'LINE', 'LTR', 'Satellite', 'SINE'))) %>% 
  gather(sample, count, -gene, -clade, -family) %>% 
  separate(sample, sep = '[.]', into = c(NA, NA, NA, 'condition')) %>%
  ggplot(aes(condition, log1p(count))) + 
  geom_boxplot(outlier.colour = NA, fill = NA) +
  geom_jitter(alpha = 0.1, color = 'black', width = 0.15) + 
  facet_col(~clade, scales = 'free_y') +
  # scale_y_continuous(limits = c(0,13)) +
  stat_compare_means(method = 'wilcox', 
                     label = 'p.format', 
                     # label.y = 2, 
                     # label.x = 0.5,
                     method.args = list(alternative = 'greater'),
                     ref.group = 'ctrl') + 
  theme(strip.placement = 'inside') + 
  ylab('log(norm. count + 1)') +
  coord_cartesian(clip = 'off')-> plt_S3B.plt


ucsc.rmsk.salmon.quant$aale$exo$deseq$counts %>% 
  # filter(gene %in% meta.data$gencode.35.tx.to.gene$gene) %>% 
  merge(meta.data$te.families.clades, by = 'gene', all.x = T) %>% 
  mutate(clade = ifelse(is.na(clade), 'coding', clade),
         clade = ifelse(gene %in% meta.data$gencode.35.lnc.gene.names$gene, 'lncRNA', clade)) %>% 
  gather(sample, count, -family, -clade, -gene) %>% 
  filter(!grepl('^MT|^RP', gene)) %>% 
  mutate(condition = ifelse(grepl('ctrl', sample), 'ctrl', 'kras'),
         sample = str_replace(sample, 'rnaEASY.aale.', '')) %>% 
  group_by(sample, condition, clade) %>% 
  summarize(clade_counts = sum(count)) %>% 
  group_by(sample) %>% 
  mutate(frac = clade_counts/sum(clade_counts)) %>% 
  mutate(clade = factor(clade, levels = c('DNA', "LINE",'LTR', 'Satellite', 'SINE', 'lncRNA', 'coding'))) %>% 
  filter(clade %in% c('DNA', "LINE",'SINE', 'LTR', 'Satellite', 'coding', 'lncRNA')) %>% 
  ggplot(aes(sample, frac, fill = clade)) +
  geom_col(color = 'white') +
  # geom_hline(yintercept = seq(0,1,0.25), color = 'white', alpha = 0.6) +
  scale_fill_viridis_d(direction = -1) +
  facet_col(~condition, scales = 'free_x') +
  xlab('') +
  ylab('fraction of norm. counts') +
  ggtitle('extracellular') +
  theme(legend.position = 'top', legend.justification = 'center', 
        axis.text.x = element_blank(), axis.ticks.x = element_blank()) -> plt_S3C.plt

ucsc.rmsk.salmon.quant$aale$intra$deseq$counts %>% 
  # filter(gene %in% meta.data$gencode.35.tx.to.gene$gene) %>% 
  merge(meta.data$te.families.clades, by = 'gene', all.x = T) %>% 
  mutate(clade = ifelse(is.na(clade), 'coding', clade),
         clade = ifelse(gene %in% meta.data$gencode.35.lnc.gene.names$gene, 'lncRNA', clade)) %>% 
  gather(sample, count, -family, -clade, -gene) %>% 
  filter(!grepl('^MT|^RP', gene)) %>% 
  mutate(condition = ifelse(grepl('ctrl', sample), 'ctrl', 'kras'),
         sample = str_replace(sample, 'rnaEASY.aale.', '')) %>% 
  group_by(sample, condition, clade) %>% 
  summarize(clade_counts = sum(count)) %>% 
  group_by(sample) %>% 
  mutate(frac = clade_counts/sum(clade_counts)) %>% 
  mutate(clade = factor(clade, levels = c('DNA', "LINE",'LTR', 'Satellite', 'SINE', 'lncRNA', 'coding'))) %>% 
  filter(clade %in% c('DNA', "LINE",'SINE', 'LTR', 'Satellite', 'coding', 'lncRNA')) %>% 
  ggplot(aes(sample, frac, fill = clade)) +
  geom_col(color = 'white') +
  # geom_hline(yintercept = seq(0,1,0.25), color = 'white', alpha = 0.6) +
  scale_fill_viridis_d(direction = -1) +
  facet_col(~condition, scales = 'free_x') +
  xlab('') +
  ylab('fraction of norm. counts') +
  ggtitle('intracellular') +
  theme(legend.position = 'top', legend.justification = 'center', 
        axis.text.x = element_blank(), axis.ticks.x = element_blank()) -> plt_S3D.plt

# exo_biotype.df %>% 
#   mutate(context = 'extracellular') %>% 
#   rbind(intra_biotype.df %>% 
#           mutate(context = 'intracellular')) %>% 
#   ggplot(aes(context, frac, color = condition)) + 
#   geom_boxplot(outlier.colour = NA, fill = NA) + 
#   # geom_jitter(alpha = 0.6, width = 1) + 
#   facet_wrap(~clade, scales = 'free_y', ncol = 2) + 
#   xlab('') + ylab('fraction of norm. counts') + 
#   # stat_compare_means(method = 'wilcox', comparisons = list(c('intracellular', 'extracellular'))) +
#   scale_color_manual(values = c(viridis(3)[1], viridis(3)[2])) + 
#   theme(legend.position = c(0.99, 0.15)) -> plt_S3B.plt


fig_S3_patch <- "
AB
AB
CD
"

wrap_plots(plt_S3A.plt,
           plt_S3B.plt,
           plt_S3D.plt,
           plt_S3C.plt,
           design = fig_S3_patch) + 
  plot_annotation(tag_levels = 'A')  -> figure_S3_patchwork.fig


ggsave(plot = figure_S3_patchwork.fig,
       filename = file.path(figure.dir, 'fig.2', 'figure_S2_patchwork.pdf'),
       width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')

figure_S3_patchwork.fig

Figure S3

ucsc.rmsk.salmon.quant$aale$intra$deseq$de %>%
  filter(!gene %in% meta.data$gencode.35.tx.to.gene$gene,
         padj < 0.05) %>%
  mutate(context = 'AALE') %>% 
  rbind(ucsc.rmsk.salmon.quant$hbec$lacz_v_v12$deseq$de %>%
    filter(!gene %in% meta.data$gencode.35.tx.to.gene$gene,
           padj < 0.05) %>% 
      mutate(context = 'HBEC')) %>% 
  merge(meta.data$te.families.clades, by = 'gene') %>% 
  mutate(clade = factor(clade, levels = c('DNA', 'LINE', "LTR", 'Satellite', 'SINE'))) %>% 
  ggplot(aes(log2FoldChange, -log10(padj), label = gene, color = clade)) + 
  geom_point(alpha = 0.6) + 
  ggrepel::geom_text_repel(aes(label = ifelse(log2FoldChange > 0, gene, '')), 
                           show.legend = F, max.overlaps = 30, size = txt.mm_to_pts(0.8)) +
  scale_color_viridis_d(direction = -1, name = 'TE Clade') +
  facet_col(~context, scales = 'free_y') +
  xlab('RNA log2FoldChange') +
  xlim(-6,4) +
  geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') + 
  theme(legend.position = c(0.35,0.35)) -> plt_4A.plt

salmon.quant$aale$intra$deseq$de %>% 
  dplyr::rename('AALE' = log2FoldChange) %>% 
  filter(padj < 0.05,
         gene %in% meta.data$trono.krab.znfs$gene) %>% 
  merge(salmon.quant$hbec$lacz_v_v12$deseq$de %>% 
          dplyr::rename('HBEC' = log2FoldChange) %>% 
          filter(padj < 0.05) %>% 
          mutate(HBEC = HBEC), by = 'gene') %>% 
  merge(meta.data$trono.krab.znfs, by = 'gene') %>% 
  ggplot(aes(AALE, HBEC, color = ZNF)) + 
  geom_point() + 
  scale_color_viridis_d(name = 'ZNF type') +
  xlab('AALE log2FoldChange') + ylab('HBEC log2FoldChange') +
  geom_hline(yintercept = 0, linetype = 'dashed', alpha = 0.4) + 
  geom_vline(xintercept = 0, linetype = 'dashed', alpha = 0.4) +
  scale_x_continuous(limits = c(-3,3)) +
  scale_y_continuous(limits = c(-2,2)) +
  geom_text_repel(aes(label = ifelse(AALE < -1 & HBEC < -0.35, gene, '')), show.legend = F, size = txt.mm_to_pts(0.8), color = 'black') + 
  theme(legend.position = c(0.99, 0.99)) -> plt_4B.plt


plt_4D_patch <- "
AAA
BCD
EEE
"

p4 <- ggplot(data.frame(l = 'putatively bound TE RNA log2FoldChange', x = 1, y = 1)) +
      geom_text(aes(x, y, label = l), angle = 0, size = txt.mm_to_pts(0.8)) + 
      theme_void() +
      coord_cartesian(clip = "off")
  
wrap_elements(full = wrap_plots(guide_area(),
                              intra.aale.te_ranges_list$znf_plts$scatter_plt + theme(legend.direction = 'horizontal') + ggtitle('AALE in.') + xlab(''),
                              exo.aale.te_ranges_list$znf_plts$scatter_plt + theme(legend.direction = 'horizontal') + ggtitle('AALE ex.') + ylab('') + xlab(''),
                              hbec.aale.te_ranges_list$znf_plts$scatter_plt + theme(legend.direction = 'horizontal') + ggtitle('HBEC in.') + ylab('') + xlab(''), 
                              p4,
                              guides = 'collect', heights = c(0.05, 1, 0.005), design = plt_4D_patch)) -> plt_4D.plt


fig_4_patch <- "
AB
CD
EE
"

wrap_plots(plt_4A.plt,
           plt_4B.plt,
           intra.aale.te_ranges_list$znf_plts$target_plt + theme(legend.position = c(0.3,0.3)) + ggtitle('ZNF motif binding'),
           intra.aale.te_ranges_list$znf_plts$znf_score + theme(legend.position = c(0.99,0.99)) + ggtitle('ZNF binding score rank'),
           plt_4D.plt,
           design = fig_4_patch) + 
  plot_annotation(tag_levels = 'A')  -> figure_4_patchwork.fig


ggsave(plot = figure_4_patchwork.fig,
       filename = file.path(figure.dir, 'fig.3', 'figure_S3_patchwork.pdf'),
       width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')

figure_4_patchwork.fig

Figure S4

intra.aale.te_ranges_list$znf_plts$de_list %>% 
  select(TF, log2FoldChange.x, padj.x, clade, context) %>% 
  distinct() %>% 
  group_by(TF, clade) %>%  
  summarise(log2FoldChange = median(log2FoldChange.x), 
            padj = median(padj.x)) %>% 
  ggplot(aes(log2FoldChange, -log10(padj))) + 
  geom_point(alpha = 0.4) + 
  ggrepel::geom_text_repel(aes(label = ifelse(abs(log2FoldChange) > 1, TF, '')), 
                               show.legend = F,
                               max.overlaps = 20, size = txt.mm_to_pts(0.8)) + 
  theme(legend.position = 'none') +
  facet_col(~clade) +
  xlab('RNA log2FoldChange') +
  geom_vline(xintercept = 0, linetype = 'dashed', alpha = 0.4) +
  ggtitle('AALE in. DE ZNFs targeting TE clades') -> plt_S4A.plt

exo.aale.te_ranges_list$znf_plts$de_list %>% 
  select(TF, log2FoldChange.x, padj.x, clade, context) %>% 
  distinct() %>% 
  group_by(TF, clade) %>%  
  summarise(log2FoldChange = median(log2FoldChange.x), 
            padj = median(padj.x)) %>% 
  ggplot(aes(log2FoldChange, -log10(padj))) + 
  geom_point(alpha = 0.4) + 
  ggrepel::geom_text_repel(aes(label = ifelse(abs(log2FoldChange) > 2.5, TF, '')), 
                               show.legend = F,
                               max.overlaps = 20, size = txt.mm_to_pts(0.8)) + 
  theme(legend.position = 'none') +
  facet_col(~clade) +
  xlab('RNA log2FoldChange') +
  geom_vline(xintercept = 0, linetype = 'dashed', alpha = 0.4) +
  ggtitle('AALE ex. DE ZNFs targeting TE clades') -> plt_S4B.plt

hbec.aale.te_ranges_list$znf_plts$de_list %>% 
  select(TF, log2FoldChange.x, padj.x, clade, context) %>% 
  distinct() %>% 
  group_by(TF, clade) %>%  
  summarise(log2FoldChange = median(log2FoldChange.x), 
            padj = median(padj.x)) %>% 
  ggplot(aes(log2FoldChange, -log10(padj))) + 
  geom_point(alpha = 0.4) + 
  ggrepel::geom_text_repel(aes(label = ifelse(abs(log2FoldChange) > 1, TF, '')), 
                               show.legend = F,
                               max.overlaps = 20, size = txt.mm_to_pts(0.8)) + 
  theme(legend.position = 'none') +
  facet_col(~clade) +
  xlab('RNA log2FoldChange') +
  geom_vline(xintercept = 0, linetype = 'dashed', alpha = 0.4) +
  ggtitle('HBEC in. DE ZNFs targeting TE clades') -> plt_S4C.plt

fig_S4_patch <- "
AD
BD
CE
"

wrap_plots(plt_S4A.plt,
           plt_S4B.plt,
           plt_S4C.plt,
           wrap_elements(full = intra.aale.te_ranges_list$plts$upset_plt),
           plot_spacer(),
           design = fig_S4_patch) + 
  plot_annotation(tag_levels = 'A')  -> figure_S4_patchwork.fig


ggsave2(plot = figure_S4_patchwork.fig,
       filename = file.path(figure.dir, 'fig.4', 'figure_S4_patchwork.pdf'),
       width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')

figure_S4_patchwork.fig

Figure S5

a549_de$de_682 %>% 
  mutate(oe = 'A549 ZNF682 Overexpression') %>% 
  rbind(a549_de$de_257 %>% 
    mutate(oe = 'A549 ZNF257 Overexpression')) %>% 
  merge(salmon.quant$aale$intra$deseq$de, by = 'gene_id') %>% 
  filter(gene %in% c(ifn_a, ifn_g),
         padj.x < 0.05) %>%
  ggplot(aes(log2FoldChange.x, log2FoldChange.y, label = gene)) + 
  geom_point(alpha = 0.6) + 
  ggrepel::geom_text_repel(aes(label = ifelse(log2FoldChange.x < -0.1 & log2FoldChange.y > 0.1, gene, '')), 
                           max.overlaps = 5) +
  facet_col(~oe) +
  geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') + 
  geom_hline(yintercept = 0, alpha = 0.3, linetype = 'dashed') + 
  ylab('<--- WT   AALE log2FC   G12D --->') + xlab('A549 log2FC') -> plt_5C.plt


a549_te_de$de_682 %>% 
  mutate(oe = 'A549 ZNF682 Overexpression') %>% 
  rbind(a549_te_de$de_257 %>% 
    mutate(oe = 'A549 ZNF257 Overexpression')) %>% 
  filter(!gene_id %in% meta.data$gencode.35.tx.to.gene$gene,
         !grepl(')n', gene_id),
         padj < 0.05) %>% 
  merge(meta.data$te.families.clades, by.x = 'gene_id', by.y = 'gene') %>% 
  ggplot(aes(log2FoldChange, -log10(padj), label = gene_id, color = clade)) + 
  geom_point(alpha = 0.6) + 
  geom_vline(xintercept = 0, linetype = 'dashed', alpha = 0.5) +
  scale_color_viridis_d(direction = -1) +
  ggrepel::geom_text_repel(aes(label = ifelse(log2FoldChange < -0.5, gene_id, '')), 
                           show.legend = F) +
  facet_col(~oe) + 
  xlab('RNA log2FoldChange') +
  theme(legend.position = c(0.3, 0.99)) -> plt_5D.plt

fig_5_patch <- "
CD
CD
EE
"

wrap_plots(plt_5C.plt,
           plt_5D.plt,
           plot_spacer(),
           design = fig_5_patch) + 
  plot_annotation(tag_levels = 'A')  -> figure_5_patchwork.fig


ggsave(plot = figure_5_patchwork.fig,
       filename = file.path(figure.dir, 'fig.5', 'figure_S5_patchwork.pdf'),
       width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')

figure_5_patchwork.fig

Figure S6

fig_S5_patch <- "
AA
BB
"

wrap_plots(exo.aale.te_ranges_list$znf_plts$znf_score + theme(legend.position = c(0.99,0.99)) + ggtitle('AALE ex. binding score rank'),
           hbec.aale.te_ranges_list$znf_plts$znf_score + theme(legend.position = c(0.99,0.99)) + ggtitle('HBEC in. binding score rank'),
           design = fig_S5_patch) + 
  plot_annotation(tag_levels = 'A')  -> figure_S5_patchwork.fig


ggsave(plot = figure_S5_patchwork.fig,
       filename = file.path(figure.dir, 'fig.6', 'figure_S6_patchwork.pdf'),
       width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')

figure_S5_patchwork.fig

Figure 4

rbind(znf_kras_de_prom_df, znf_ctrl_de_prom_df) %>%
  ggplot(aes(position, cov, group = condition, color = condition)) +
    stat_summary(geom="ribbon", fun.data=mean_cl_boot, width=0.1, conf.int=0.95, 
                 fill='grey', alpha = 0.1, size = rel(0.05), show.legend = F)+
    stat_summary(geom="point", fun=mean, alpha = 1, size = rel(1)) +
    scale_color_manual(values = c(viridis(3)[1], viridis(3)[2])) +
    xlab('distance to TSS') +
    ylab('mean CPM (95% CI)') + 
    xlim(-1000, 500) + 
    theme(legend.position = c(0.30, 0.99)) -> plt_6A.plt

salmon.quant$aale$intra$deseq$de %>% 
  merge(uniq_ctrl_peak_genes, by.x = 'gene', by.y = 'Gene Name') %>% 
  mutate(`unique peaks` = ifelse(gene %in% meta.data$trono.krab.znfs$gene | grepl('ZNF', gene), 'ZNF', 'other')) %>% 
  ggplot(aes(log2FoldChange, -log10(padj), label = gene, color = `unique peaks`)) + 
  geom_point() +
  scale_color_manual(values = c('grey', 'black'), name = 'unique ATAC peak') +
  geom_text_repel(aes(label = ifelse(`unique peaks` == 'ZNF' | abs(log2FoldChange) > 4, gene, '')), 
                  size = txt.mm_to_pts(0.8), show.legend = F, max.overlaps = 30, segment.color = NA) + 
  geom_vline(xintercept = c(0), alpha = c(0.3), linetype = 'dashed') +
  xlab('RNA log2FoldChange') +
  theme(legend.position = c(0.40, 0.99)) -> plt_6B.plt

salmon.quant$aale$intra$deseq$de %>% 
  merge(znf_ame_out, by.x = 'gene', by.y = 'TF') %>% 
  mutate(gene_set = ifelse(gene %in% ifn_gene_list,'IFN', 'other'),
       gene_set = ifelse(gene %in% msig.df$HALLMARK_KRAS_SIGNALING_UP, 'KRAS', gene_set),
       gene_set = ifelse(gene %in% meta.data$trono.krab.znfs$gene | grepl('^ZNF', gene), 'ZNF', gene_set)) %>% 
  mutate(gene_set = factor(gene_set, levels = c('KRAS','IFN','ZNF', 'other'))) %>% 
  ggplot(aes(log2FoldChange, -log10(padj), color = gene_set, label = gene)) + 
  geom_point() + 
  scale_color_viridis_d(name = 'TF classification') +
  xlab('DE TF RNA log2FoldChange') +
  xlim(-2.5,2.5)+
  geom_text_repel(aes(label = ifelse(abs(log2FoldChange) > 0, gene, '')), color = 'black') + 
  geom_vline(xintercept = 0, alpha = 0.3, linetype = 'dashed') + 
  theme(legend.position = 'top') -> plt_6D.plt


fig_6_patch <- "
AB
CC
DD
"

wrap_plots(plt_6A.plt,
           plt_6B.plt,
           wrap_elements(full = znf_atac_6.plt),
           plt_6D.plt,
           # plt_6E.plt,
           heights = c(0.75,1.5,0.75),
           design = fig_6_patch) +
  plot_annotation(tag_levels = 'A') -> figure_6_patchwork.fig


ggsave(plot = figure_6_patchwork.fig,
       filename = file.path(figure.dir, 'fig.4', 'figure_4_patchwork.pdf'),
       width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')

figure_6_patchwork.fig

Figure 5

complex plotting

plot_complex_heatmap <- function(filter_values, show_genes = F, split_vector = NULL, split_title = NULL) {
  
   set.seed(123)
  
  ## heatmap count matrix
  working_toil_counts %>% 
    filter(gene %in% filter_values) %>%
    group_by(gene) %>%
    mutate(count = scale(log1p(count), center = T)) %>%
    select(sample, gene, count) %>%
    pivot_wider(names_from = gene, values_from = count) %>%
    distinct() %>% 
    column_to_rownames('sample') %>% 
    t() -> toil_hm_counts
  
  ## heatmap mutation annotation 
  working_toil_counts %>% 
    select(sample, 'G12D' = KRAS) %>%
    distinct() %>% 
    column_to_rownames('sample') %>% 
    ComplexHeatmap::columnAnnotation(df = . , 
                                     col = list(G12D = c('WT_normal' = 'white', 
                                                         'p.G12D' = viridis(3)[2])),
                                      show_legend = F,
                                      annotation_name_side = 'right',
                                      annotation_name_gp = grid::gpar(fontsize = 8),
                                      simple_anno_size = unit(2.5, "mm")) -> toil_sample_anno
  ## heatmap DE barpot annotation
  working_toil_counts %>% 
    filter(gene %in% filter_values) %>%
    select(gene) %>% 
    distinct() %>% 
    merge(salmon.quant$aale$intra$deseq$de %>% select(gene, log2FoldChange), by = 'gene') %>% 
    column_to_rownames('gene') -> toil_de
  
  toil_de[order(match(rownames(toil_de), rownames(toil_hm_counts))), , drop =F] -> toil_de
  
  toil_de_list <- setNames(toil_de$log2FoldChange, rownames(toil_de))
  
  ComplexHeatmap::rowAnnotation(
    `AALE RNA log2FC` = ComplexHeatmap::anno_barplot(x = toil_de_list,
        bar_width = 1, 
        which = 'row',
        gp = grid::gpar(col = "black", fill = viridis(3)[2]), 
        border = FALSE,
        annotation_name_gp = grid::gpar(fontsize = 8),
        width = unit(1.25, "cm"),
        height = unit(0.75, "cm")), 
    show_annotation_name = TRUE,
    annotation_name_gp = grid::gpar(fontsize = 8)) -> log2fc_ha
  
  working_toil_counts %>%
  filter(gene %in% filter_values) %>%
  select(gene) %>%
  distinct() %>%
  mutate(uniq_peak = ifelse(gene %in% znf_unique_peaks$name, T, F)) %>%
  column_to_rownames('gene') %>%
  ComplexHeatmap::rowAnnotation(df =.,
                                col = list(uniq_peak = c('TRUE' = viridis(2)[1], 'FALSE' = 'white')),
                                show_legend = F, annotation_name_side = 'top',
                                annotation_name_gp = grid::gpar(fontsize = 8),
                                simple_anno_size = unit(2, "mm")) -> gene_motif_anno
  
  ComplexHeatmap::Heatmap(toil_hm_counts, 
                        name = 'TCGA\nz-score', 
                        show_row_names = show_genes,
                        show_column_names = F,
                        row_split = split_vector,
                        row_title = split_title,
                        row_title_gp = grid::gpar(fontsize = 7, face = 'bold'),
                        row_names_side = 'right',
                        row_dend_width = unit(3, 'mm'),
                        left_annotation = gene_motif_anno,
                        right_annotation = log2fc_ha,
                        top_annotation = toil_sample_anno,
                        heatmap_legend_param = 
                          list(labels_gp = grid::gpar(fontsize = 7),
                               legend_gp = grid::gpar(fontsize = 7),
                               legend_direction = 'vertical',
                               legend_height = unit(3.5, "cm"),
                               title_gp = grid::gpar(fontsize = 7, face = 'bold')),
                        row_names_gp = grid::gpar(fontsize = 7),
                        heatmap_width = patch_fig_width*0.875)
    
  
}

consensus_atac_boolean_anno %>% 
  filter(ctrl_R1.mLb.clN.bool == T) %>%
  filter(`Gene Name` %in% filter(salmon.quant$aale$intra$deseq$de, padj < 0.05, 
                                 log2FoldChange < -1.5)$gene) %>%
  filter(`Gene Name` %in% meta.data$trono.krab.znfs$gene | grepl('^ZNF', `Gene Name`)) %>%
  separate(Annotation, sep = '\\(', into = c('annotation', NA)) %>% 
  filter(annotation == "promoter-TSS ") %>%
  mutate(strand = '+') %>%
  select(chr, start, end, strand, name = `Gene Name`) -> znf_unique_peaks

salmon.quant$aale$intra$deseq$de %>% 
  filter(padj < 0.05, log2FoldChange < -1, gene %in% meta.data$trono.krab.znfs$gene | grepl('ZNF', gene)) -> aale_znf

aale_znf_chm <- plot_complex_heatmap(aale_znf$gene, show_genes = T, split_vector = NULL, split_title = NULL)

working_toil_counts %>% 
  filter(gene %in% c(aale_znf$gene)) %>%
  group_by(gene) %>% 
  mutate(count = scale(log1p(count), center = T)) %>% 
  select(sample, count, gene, KRAS) %>% 
  group_by(gene) %>% 
  summarize(w = wilcox.test(count ~ KRAS)$p.value) %>% 
  filter(w < 0.05) %>% 
  pull(gene) -> toil_sig_znf

working_toil_counts %>% 
  filter(gene %in% toil_sig_znf) %>%
  group_by(gene) %>% 
  mutate(count = scale(log1p(count), center = T)) %>% 
  rename('KRAS' = 'G12D') %>% 
  ggplot(aes(gene, count, color = G12D)) + 
  geom_boxplot(fill = NA, outlier.colour = NA) + 
  geom_point(alpha = 0.1, position = position_jitterdodge(jitter.width = 0.25)) +
  ylab('z-score') + xlab('') + 
  scale_color_manual(values = c(viridis(3)[2], viridis(3)[1]), name = 'KRAS') + 
  theme(axis.text.x = element_text(angle = 40, hjust = 1),
        legend.position = c(0.99, 0.25)) -> plt_7B.plt

znf.avg.counts.tidy.stratified <-
  norm.counts.final %>%
  filter(sample_type == 'Primary Tumor') %>%
  filter(gene %in% toil_sig_znf) %>%
  select(sample, count) %>% 
  group_by(sample) %>%
  summarize(avg.count = mean(count)) %>%
  mutate(gene.set = 'aale_znf_signal') %>%
  group_by(gene.set) %>%
  mutate(stratum = ifelse(avg.count >= quantile(avg.count, 0.66),
                          'top-third', ifelse(avg.count <= quantile(avg.count, 0.33),
                                              'bottom-third', 'middle'))) %>%
  filter(stratum != 'middle') %>%
  spread(gene.set, stratum)

surv.obj <- Surv(time = filter(luad.phenotypes.merge %>% select(sample, OS.time, OS) %>% distinct() , 
                               sample %in% znf.avg.counts.tidy.stratified$sample)$OS.time,
                 event = filter(luad.phenotypes.merge %>% select(sample, OS.time, OS) %>% distinct(), 
                                sample %in% znf.avg.counts.tidy.stratified$sample)$OS)

surv.fit <- survfit(as.formula(surv.obj ~ znf.avg.counts.tidy.stratified$aale_znf_signal), 
                    data = znf.avg.counts.tidy.stratified)
ggsurvplot(surv.fit,
           pval = T, 
           legend.title = "",
           ggtheme = theme_set_fun() + theme(legend.position = c(0.45,0.95)),
           pval.coord = c(0, 0.08),
           palette = viridis(3)[c(2,1)]) -> surv.plt

model.plt <- magick::image_read_pdf(path = file.path(figure.dir, 'fig.5','aale_model_cropped.pdf'))

fig_7_patch <- "
AA
BB
CD
"

wrap_plots(patchwork::wrap_elements(full = 
                                      grid::grid.grabExpr(ComplexHeatmap::draw(aale_znf_chm,
                                                                 heatmap_legend_side = 'right'), 
                                                                 width = patch_fig_width)),
           plt_7B.plt,
           wrap_elements(full = surv.plt$plot),
           wrap_elements(full = magick::image_ggplot(model.plt)),
           heights = c(1.5,0.75,0.75),
           design = fig_7_patch) +
  plot_annotation(tag_levels = 'A') -> figure_7_patchwork.fig


ggsave(plot = figure_7_patchwork.fig,
       filename = file.path(figure.dir, 'fig.5', 'figure_5_patchwork.pdf'),
       width = patch_fig_width, height = patch_fig_height, device = cairo_pdf, units = 'mm')

figure_7_patchwork.fig